# Notebook-wide setup: silence warnings, widen the Jupyter display, and import
# the analysis stack (market data, clustering, dimensionality reduction,
# plotting).
import warnings
warnings.filterwarnings('ignore')
# NOTE(review): IPython.core.display is deprecated in recent IPython releases;
# IPython.display is the supported import location.
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import time
import datetime
import tqdm
from typing import List, Dict, Tuple
from joblib import Parallel, delayed
import pandas as pd
import pandas_market_calendars as mcal
from pandas_datareader import data as web
from pandas_datareader._utils import RemoteDataError
import numpy as np
import quantstats as qs
import matplotlib.pylab as plt
import matplotlib.cm as cm
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, locally_linear_embedding
import umap
from sklearn.metrics import silhouette_samples, silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet
from scipy.spatial.distance import pdist
from sklearn.covariance import LedoitWolf, ShrunkCovariance
# Global seaborn/matplotlib theme applied to every chart below.
sns.set(font_scale=1.025, rc={'figure.figsize': (10, 6),
'axes.facecolor': '#f2f2f0',
'axes.edgecolor': '#f2f2f0',
'figure.facecolor': '#f2f2f0',
'grid.color': '#c4c4c4',
'grid.linewidth': 0.5,
'lines.linewidth': 1.5,
'text.color': '#000000',
'xtick.color': '#000000',
'ytick.color': '#000000'})
def dt_to_string(dt: datetime.datetime) -> str:
    """Format *dt* as a 'YYYY-MM-DD' string."""
    return dt.strftime('%Y-%m-%d')
def triag_corr(df: pd.DataFrame, fig_xy: tuple = (13, 10), cmap: str = 'rocket', annot: bool = True, annot_s: int = 8,
               fmt: str = ".3%", abs_: bool = False, method: str = 'pearson') -> None:
    """Plot the lower triangle of ``df.corr(method)`` as an annotated heatmap.

    Parameters
    ----------
    df : frame whose column correlations are plotted.
    fig_xy : (width, height) of the figure. Was a mutable list default
        (``[13, 10]``) — replaced with an immutable tuple; callers may still
        pass a list since only indexing is used.
    cmap, annot, annot_s, fmt : seaborn heatmap styling options.
    abs_ : if True, plot absolute correlations.
    method : correlation method forwarded to ``DataFrame.corr``.
    """
    plt.figure(figsize=(fig_xy[0], fig_xy[1]))
    corr = df.corr(method=method).abs() if abs_ else df.corr(method=method)
    # Mask the upper triangle plus diagonal so each pair appears only once.
    mask = np.tri(*corr.shape).T
    sns.heatmap(corr, mask=mask, cmap=cmap, annot=annot, annot_kws={"size": annot_s}, fmt=fmt)
    plt.show()
def get_redundant_pairs(df: pd.DataFrame) -> set:
    """Return the set of column pairs on or below the correlation-matrix
    diagonal (self-pairs plus one mirror image of every off-diagonal pair)."""
    cols = df.columns
    return {(cols[i], cols[j])
            for i in range(df.shape[1])
            for j in range(i + 1)}
def mu_std_corr_matrix(corr: pd.DataFrame, rnd: int = 3) -> tuple:
    """Return ``(mean, std)`` of the strictly-upper-triangle absolute
    correlations, each rounded to *rnd* decimals.

    The lower triangle and diagonal are blanked to NaN so every pair is
    counted exactly once; the caller's frame is not modified (``abs()``
    returns a fresh frame, so the old explicit ``.copy()`` was redundant).
    Note: the original ``pd.DataFrame or np.array`` annotation evaluated to
    just ``pd.DataFrame`` — and a bare ndarray would fail on ``.unstack()``,
    so a DataFrame is genuinely required.
    """
    corr = corr.abs()
    corr.values[np.tril_indices_from(corr)] = np.nan  # drop diag + lower tri
    vals = corr.unstack()
    return (round(vals.mean(), rnd), round(vals.std(), rnd))
def get_top_abs_correlations(df: pd.DataFrame, n: int = 5) -> pd.DataFrame:
    """Return the *n* most strongly (absolutely) correlated column pairs.

    Self-pairs and mirror-image duplicates are dropped before ranking.
    Fix: the return annotation said ``pd.Series`` but the function has
    always returned a one-column DataFrame ('Correlation') indexed by
    ('X', 'y').
    """
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)  # self + lower-triangle pairs
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)\
        .to_frame().rename(columns={0: 'Correlation'}).rename_axis(['X', 'y'])
    return au_corr.head(n)
def score_pca_silhouette(X, n_comp: int = 2, k: int = 3, rs: int = 1) -> tuple:
    """Project *X* onto *n_comp* principal components, KMeans-cluster into
    *k* clusters, and return ``(average_silhouette_score, k)``.

    Fix/generalization: the parameter was annotated ``np.array`` but the
    body called ``X.values``, so only a DataFrame actually worked. Both a
    DataFrame and a plain ndarray are now accepted. (The ``-> set``
    annotation was also wrong — a tuple is returned.)
    """
    data = PCA(n_components=n_comp).fit_transform(getattr(X, 'values', X))
    clusterer = KMeans(n_clusters=k, random_state=rs)
    # clusterer = KMedoids(n_clusters=k, random_state=rs)
    cluster_labels = clusterer.fit_predict(data)
    silhouette_avg = silhouette_score(data, cluster_labels)
    return (silhouette_avg, k)
def find_pca_k(X: np.array, n_comp: int, n_clusters: list, rs: int = 1,
               n_jobs: int = -1, V: int = 5) -> pd.DataFrame:
    """Score every candidate cluster count on a PCA projection in parallel
    and return the silhouette results sorted best-first."""
    scores = Parallel(n_jobs=n_jobs, verbose=V)(
        delayed(score_pca_silhouette)(X, n_comp, k, rs) for k in n_clusters
    )
    rows = [pd.DataFrame(s).T for s in scores]
    result_df = (pd.concat(rows, axis=0)
                 .rename(columns={0: 'Silhouette_Score', 1: 'K'})
                 .reset_index(drop=True)
                 .sort_values('Silhouette_Score', ascending=False))
    return result_df
def score_tsne_silhouette(X, n_comp: int = 2, k: int = 3, p: int = 50, iters: int = 1000, rs: int = 1) -> tuple:
    """Embed *X* with t-SNE (perplexity *p*, *iters* iterations), cluster
    with KMeans(*k*), and return ``(avg_silhouette, k, p, iters)``.

    Fix/generalization (consistent with ``score_pca_silhouette``): the body
    required ``X.values`` despite the ndarray annotation; a DataFrame or a
    plain ndarray is now accepted.
    """
    data = TSNE(n_components=n_comp, perplexity=p, n_jobs=-1, n_iter=iters,
                random_state=rs).fit_transform(getattr(X, 'values', X))
    clusterer = KMeans(n_clusters=k, random_state=rs)
    cluster_labels = clusterer.fit_predict(data)
    silhouette_avg = silhouette_score(data, cluster_labels)
    return (silhouette_avg, k, p, iters)
def find_tsne_k(X: np.array, n_comp: int, n_clusters: list, perps: list, iters: list, rs: int = 1,
                n_jobs: int = -1, V: int = 5) -> pd.DataFrame:
    """Parallel grid search over (k, perplexity, n_iter) for t-SNE + KMeans;
    returns silhouette scores sorted best-first."""
    grid = [(k, p, i) for k in n_clusters for p in perps for i in iters]
    scores = Parallel(n_jobs=n_jobs, verbose=V)(
        delayed(score_tsne_silhouette)(X, n_comp, k, p, i, rs) for (k, p, i) in grid
    )
    frame = pd.concat([pd.DataFrame(s).T for s in scores], axis=0)
    frame = frame.rename(columns={0: 'Silhouette_Score', 1: 'K', 2: 'Perplexity', 3: 'N_Iter'})
    return frame.reset_index(drop=True).sort_values('Silhouette_Score', ascending=False)
def score_umap_silhouette(X: np.array, n: int = 2, dist: float = 0.1, k: int = 3, rs: int = 1) -> set:
    """Embed *X* with UMAP (*n* neighbors, min_dist=*dist*), cluster with
    KMeans(*k*), and return (avg silhouette, k, n, dist)."""
    embedding = umap.UMAP(n_neighbors=n, min_dist=dist).fit_transform(X)
    labels = KMeans(n_clusters=k, random_state=rs).fit_predict(embedding)
    return (silhouette_score(embedding, labels), k, n, dist)
def find_umap_k(X: np.array, neighbors: list, dist: list, n_clusters: list, rs: int = 1,
                n_jobs: int = -1, V: int = 5) -> pd.DataFrame:
    """Parallel grid search over (n_neighbors, min_dist, k) for UMAP + KMeans;
    returns silhouette scores sorted best-first.

    Bug fix: the generator previously iterated the *global* name ``dists``
    instead of the ``dist`` parameter, so the argument was silently ignored
    (and a NameError was raised when no such global existed). It now uses
    the parameter.
    """
    out = Parallel(n_jobs=n_jobs, verbose=V)(
        delayed(score_umap_silhouette)(X, n, d, k, rs)
        for n in neighbors for d in dist for k in n_clusters
    )
    result_df = pd.concat([
        pd.DataFrame(o).T for o in out
    ], axis=0).rename(columns={0: 'Silhouette_Score', 1: 'K', 2: 'N_Neighbors', 3: 'Min_Dist'})\
        .reset_index(drop=True).sort_values('Silhouette_Score', ascending=False)
    return result_df
def score_lle_silhouette(X: np.array, n_comp: int = 2, n: int = 2, k: int = 3, rs: int = 1) -> set:
    """Embed *X* with standard LLE (*n* neighbors, *n_comp* components),
    cluster with KMeans(*k*), and return (avg silhouette, k, n)."""
    embedding, _ = locally_linear_embedding(X, n_components=n_comp, n_neighbors=n, method='standard', n_jobs=-1)
    labels = KMeans(n_clusters=k, random_state=rs).fit_predict(embedding)
    return (silhouette_score(embedding, labels), k, n)
def find_lle_k(X: np.array, n_comp: int, neighbors: list, n_clusters: list, rs: int = 1,
               n_jobs: int = -1, V: int = 5) -> pd.DataFrame:
    """Parallel grid search over (n_neighbors, k) for LLE + KMeans; returns
    silhouette scores sorted best-first."""
    grid = [(n, k) for n in neighbors for k in n_clusters]
    scores = Parallel(n_jobs=n_jobs, verbose=V)(
        delayed(score_lle_silhouette)(X, n_comp, n, k, rs) for (n, k) in grid
    )
    frame = pd.concat([pd.DataFrame(s).T for s in scores], axis=0)
    frame = frame.rename(columns={0: 'Silhouette_Score', 1: 'K', 2: 'N_Neighbors'})
    return frame.reset_index(drop=True).sort_values('Silhouette_Score', ascending=False)
def get_symbols(symbols: List[str], begin_date: datetime.datetime, end_date: datetime.datetime,
                trans: bool = False, data_source: str = 'yahoo', verbose: bool = False) -> pd.DataFrame:
    """Download OHLCV history for each ticker and stack the results into one
    frame indexed by ``(date, symbol)``.

    Parameters
    ----------
    symbols : tickers to fetch.
    begin_date, end_date : date range (datetime or pandas Timestamp both
        work; the old ``datetime or Timestamp`` annotation evaluated to
        just the ``datetime`` module).
    trans : if True, log1p-transform every price/volume column; otherwise
        only volume is log1p-transformed.
    data_source : pandas-datareader source name.
    verbose : if True, print each symbol's shape after download (this
        parameter was previously accepted but never used).

    Remote/key errors are retried up to 3 times with a 3-second pause.
    Bug fix: a persistent ``KeyError`` previously looped forever because
    that branch never incremented the attempt counter; both error types now
    share the bounded retry logic, so a symbol that keeps failing is skipped.
    """
    out = pd.DataFrame()
    for symbol in tqdm.tqdm(symbols):
        attempts, fetched = 0, False
        while not fetched:
            try:
                df = web.DataReader(symbol, data_source, begin_date, end_date)\
                    [['Open', 'High', 'Low', 'Adj Close', 'Volume']]
                if trans:
                    df = df.apply(np.log1p).reset_index()
                else:
                    df['Volume'] = df['Volume'].apply(np.log1p)
                    df = df.reset_index()
                df.columns = ['date', 'open', 'high', 'low', 'close', 'volume']
                df['symbol'] = symbol
                df = df.set_index(['date', 'symbol'])
                if verbose:
                    print(f'{symbol}: {df.shape}')
                out = pd.concat([out, df], axis=0)
            except (RemoteDataError, KeyError) as err:
                print(f'\n{err}')
                print('\nTrying again in 3 seconds.')
                time.sleep(3)
                attempts += 1
                if attempts > 3:
                    print('\nFailed after 3 attempts, fetching aborted!!!')
                    break
            else:
                fetched = True
    # Drop rows with missing values and return sorted by (date, symbol).
    return out.sort_index().dropna()
# Analysis window and the 112-ticker stock/ETF universe.
start, end = datetime.datetime(2014, 1, 1), datetime.datetime(2021, 8, 27)
tickers = ['AAPL', 'ABT', 'AGG', 'AMC', 'AMD', 'ATVI', 'BND', 'CAT', 'CMCSA', 'QQQ',
'CVX', 'DIS', 'EA', 'EMLP', 'F', 'FB', 'GDX', 'GLD', 'GM', 'GME', 'SYY',
'GOOGL', 'HD', 'HSY', 'HYG', 'IBB', 'IBM', 'INTC', 'IYR', 'JNJ', 'JNK',
'JPM', 'KO', 'KXI', 'LQD', 'MA', 'MSFT', 'NFLX', 'ORCL', 'PEP', 'PFE',
'PG', 'SAP', 'SHY', 'SLV', 'SPY', 'T', 'TGT', 'TM', 'TMF', 'TMO', 'URE',
'TSLA', 'TSM', 'TTWO', 'UNH', 'UPRO', 'USO', 'V', 'VDC', 'VDE', 'VEU',
'VHT', 'VNQ', 'VNQI', 'VOO', 'VSS', 'VZ', 'WFC', 'WMT', 'XLE', 'XOM',
'AMZN', 'NVDA', 'M', 'BAC', 'ASML', 'ADBE', 'NKE', 'CRM', 'MCD', 'HON',
'BX', 'GS', 'SBUX', 'BA', 'EL', 'AMAT', 'FDX', 'DE', 'MMM', 'BKNG', 'COF',
'GDXJ', 'MCO', 'UPS', 'SO', 'ADSK', 'D', 'NOC', 'LHX', 'ANTM', 'DG', 'FTNT',
'ORLY', 'WCN', 'AZO', 'EFX', 'VLO', 'HRL', 'KMX', 'TXN']
# Sanity check: print the universe size only if there are no duplicates.
if len(tickers) == len(set(tickers)): print(f'{len(tickers)} Stocks')
112 Stocks
# Download the full OHLCV history (log1p-transformed, trans=True), persist it,
# then reload from disk so later cells can run without re-fetching.
ohlcv_df = get_symbols(
symbols=tickers,
begin_date=start,
end_date=end,
trans=True,
verbose=False
)
ohlcv_df.to_csv('./Data/112_ohlcv_stock_etf_mix_2014-01-01-2021-08-31.csv')
ohlcv_df = pd.read_csv(
'./Data/112_ohlcv_stock_etf_mix_2014-01-01-2021-08-31.csv',
index_col = [0,1]
)
# Intraday change: (close - open) / open per symbol.
# NOTE(review): shift(0) is a no-op — the groupby/shift(0) wrappers add
# nothing over using the raw columns directly; verify no lag was intended.
ohlcv_df['INTD_CH'] = (ohlcv_df.groupby(level='symbol').close.shift(0)\
- ohlcv_df.groupby(level='symbol').open.shift(0))\
/ ohlcv_df.groupby(level='symbol').open.shift(0)
# Close-weighted intraday change and its discrete first derivative.
# NOTE(review): np.gradient runs over the stacked (date, symbol) vector, so
# values at symbol boundaries mix adjacent tickers — confirm intended.
ohlcv_df['CLS_INTD_CH'] = ohlcv_df.close * ohlcv_df['INTD_CH']
ohlcv_df['CLS_INTD_CH_DERV'] = np.gradient(ohlcv_df['CLS_INTD_CH'].values)
ohlcv_df.info(verbose=False)
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 215889 entries, ('2014-01-02', 'AAPL') to ('2021-08-27', 'XOM')
Columns: 8 entries, open to CLS_INTD_CH_DERV
dtypes: float64(8)
memory usage: 13.9+ MB
# Show any columns still containing NaNs (expected: none after dropna()).
print(ohlcv_df.isna().sum()[ohlcv_df.isna().sum() > 0].to_frame().T)
Empty DataFrame Columns: [] Index: [0]
# Peek at the engineered OHLCV frame.
ohlcv_df.head()
| open | high | low | close | volume | INTD_CH | CLS_INTD_CH | CLS_INTD_CH_DERV | ||
|---|---|---|---|---|---|---|---|---|---|
| date | symbol | ||||||||
| 2014-01-02 | AAPL | 3.037148 | 3.039459 | 3.030858 | 2.921464 | 19.273754 | -0.038090 | -0.111278 | -0.027892 |
| ABT | 3.665867 | 3.673766 | 3.663562 | 3.520969 | 15.418427 | -0.039526 | -0.139170 | 0.057552 | |
| ADBE | 4.095344 | 4.103139 | 4.093344 | 4.099166 | 14.825620 | 0.000933 | 0.003826 | 0.068790 | |
| ADSK | 3.918601 | 3.926715 | 3.909620 | 3.917011 | 14.726990 | -0.000406 | -0.001590 | -0.088667 | |
| AGG | 4.677026 | 4.678142 | 4.676839 | 4.496552 | 14.113094 | -0.038587 | -0.173509 | -0.056817 |
# Wide matrix of (log1p) closes, keeping only dates where every symbol traded.
closes = ohlcv_df.close.unstack().dropna()
# Count rows containing NaN/inf — expected 0.
# NOTE(review): .any(1) uses the deprecated positional axis; any(axis=1) is
# the stable spelling in newer pandas.
print(len(closes[closes.isin([np.nan, np.inf, -np.inf]).any(1)]))
# Normalize each series to 1.0 at the first date, then build an equal-weight
# benchmark from the cross-sectional mean of daily returns.
closes_portfolio = closes / closes.iloc[0, :]
eq_portfolio_returns = closes_portfolio.pct_change().dropna().mean(axis=1)
eq_portfolio = (closes_portfolio.pct_change().dropna().mean(axis=1) + 1).cumprod()
0
# Plot every normalized price series (top) and the same series faded behind
# the equal-weight portfolio curve (bottom).
colors = cm.rainbow(np.linspace(0, 1, len(closes.columns)))
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(16,8), sharex=True)
closes_portfolio.plot(ax=ax[0], color=colors, legend=False);
closes_portfolio.plot(alpha=0.4, ax=ax[1], color=colors, legend=False)
eq_portfolio.plot(label='Eq Wt Portfolio', color='black', ax=ax[1])
fig.legend(loc=8, ncol=12, bbox_to_anchor=(0.5, 0.99), title='All Stocks Normalized Price Series')
ax[0].margins(0.01, 0.01)
ax[1].margins(0.01, 0.01)
plt.tight_layout()
plt.show()
# Daily returns, rolling 63-day (one quarter) correlation to SPY, and rolling
# quarterly variance for every name.
returns = closes.pct_change().dropna()
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(16,10), sharex=True)
returns.drop('SPY', axis=1).rolling(63).corr(returns.SPY).dropna().plot(color=colors, ax=ax[0])
ax[0].legend(loc=8, ncol=12, bbox_to_anchor=(0.5, 0.99), title='Rolling Quarterly Correlation to SPY')
returns.rolling(63).var().dropna().plot(color=colors, legend=False, ax=ax[1])
ax[1].set_title('Rolling Quarterly Variance')
ax[0].margins(0.01, 0.01)
ax[1].margins(0.01, 0.01)
plt.tight_layout()
plt.show()
# Per-symbol feature profile used as the clustering input.
stock_profile_df = (ohlcv_df.close.unstack().pct_change().mean() * 252).to_frame('Returns')
# NOTE(review): volatility is annualized with *252 instead of *sqrt(252),
# and Sharpe_ratio then multiplies by sqrt(252) again — the two cancel only
# approximately; confirm this annualization convention is intentional.
stock_profile_df['Volatility'] = ohlcv_df.close.unstack().pct_change().std() * 252
stock_profile_df['Sharpe_ratio'] = (stock_profile_df['Returns'] / stock_profile_df['Volatility']) * np.sqrt(252)
# NOTE(review): volume here is mean(log1p(volume)) * 252 — units are not a
# true annual share count.
stock_profile_df['Volume'] = ohlcv_df.volume.unstack().mean() * 252
stock_profile_df['Intraday_Ch_Derv'] = ohlcv_df.CLS_INTD_CH_DERV.unstack().mean() * 252
# Drop any symbol whose profile contains NaN/inf.
stock_profile_df = stock_profile_df[~stock_profile_df.isin([np.nan, np.inf, -np.inf]).any(1)]
stock_profile_df.head()
| Returns | Volatility | Sharpe_ratio | Volume | Intraday_Ch_Derv | |
|---|---|---|---|---|---|
| symbol | |||||
| AAPL | 0.073136 | 1.154958 | 1.005236 | 4733.392217 | 15.753683 |
| ABT | 0.043033 | 0.900066 | 0.758967 | 3937.044287 | 5.747411 |
| ADBE | 0.061731 | 0.899876 | 1.088979 | 3730.510740 | 8.311963 |
| ADSK | 0.053127 | 1.176709 | 0.716718 | 3641.698490 | -10.971822 |
| AGG | 0.007569 | 0.141907 | 0.846718 | 3766.241927 | -6.375048 |
# Distribution check of each raw profile feature: describe() + skew, then a
# histogram and boxplot side by side. Bin count via the square-root rule.
n_bins = int(np.sqrt(len(stock_profile_df)))
for col in stock_profile_df.columns:
tmp_desc = stock_profile_df[[col]].describe().T
tmp_desc['skew'] = stock_profile_df[col].skew()
print(tmp_desc)
plt.figure(figsize=(15,4))
plt.subplot(1,2,1)
stock_profile_df[col].hist(bins=n_bins, grid=False)
plt.ylabel('count')
plt.subplot(1,2,2)
sns.boxplot(x=stock_profile_df[col])
plt.show()
count mean std min 25% 50% 75% \
Returns 112.0 0.036012 0.030193 -0.042998 0.019423 0.0307 0.044773
max skew
Returns 0.171388 1.853383
count mean std min 25% 50% 75% \
Volatility 112.0 1.099252 0.847363 0.034033 0.718085 0.899971 1.232404
max skew
Volatility 6.940179 4.12298
count mean std min 25% 50% \
Sharpe_ratio 112.0 0.576929 0.306338 -0.438121 0.375831 0.582165
75% max skew
Sharpe_ratio 0.784297 1.342797 -0.337606
count mean std min 25% 50% \
Volume 112.0 3822.127256 382.5093 2664.122031 3612.819701 3841.296549
75% max skew
Volume 4033.530103 4733.392217 -0.469879
count mean std min 25% 50% \
Intraday_Ch_Derv 112.0 -0.000052 9.573548 -25.546448 -7.369433 0.012058
75% max skew
Intraday_Ch_Derv 7.041744 23.397123 -0.025163
# Keep an untouched copy of the profiles, then build the model matrix X:
# log1p the heavily right-skewed Volatility and standardize every column.
og_stock_profile_df = stock_profile_df.copy()
X = stock_profile_df.copy()
X.Volatility = X.Volatility.apply(np.log1p)
X = pd.DataFrame(
StandardScaler().fit_transform(X),
index=og_stock_profile_df.index,
columns=og_stock_profile_df.columns
)
X.head()
| Returns | Volatility | Sharpe_ratio | Volume | Intraday_Ch_Derv | |
|---|---|---|---|---|---|
| symbol | |||||
| AAPL | 1.235090 | 0.263525 | 1.404435 | 2.393041 | 1.652944 |
| ABT | 0.233572 | -0.175108 | 0.596911 | 0.301780 | 0.603046 |
| ADBE | 0.855640 | -0.175456 | 1.679034 | -0.240591 | 0.872129 |
| ADSK | 0.569410 | 0.298520 | 0.458374 | -0.473818 | -1.151201 |
| AGG | -0.946253 | -1.949358 | 0.884648 | -0.146759 | -0.668890 |
# Confirm the scaled matrix shape/dtypes.
X.info()
<class 'pandas.core.frame.DataFrame'> Index: 112 entries, AAPL to XOM Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Returns 112 non-null float64 1 Volatility 112 non-null float64 2 Sharpe_ratio 112 non-null float64 3 Volume 112 non-null float64 4 Intraday_Ch_Derv 112 non-null float64 dtypes: float64(5) memory usage: 5.2+ KB
# Re-check the distributions after scaling (means ~0, std ~1; skew is
# unchanged by standardization except for the log1p'd Volatility).
for col in stock_profile_df.columns:
tmp_desc = X[[col]].describe().T
tmp_desc['skew'] = X[col].skew()
print(tmp_desc)
plt.figure(figsize=(15,4))
plt.subplot(1,2,1)
X[col].hist(bins=n_bins, grid=False)
plt.ylabel('count')
plt.subplot(1,2,2)
sns.boxplot(x=X[col])
plt.show()
count mean std min 25% 50% \
Returns 112.0 -1.566207e-16 1.004494 -2.628544 -0.551893 -0.176727
75% max skew
Returns 0.291484 4.503789 1.853383
count mean std min 25% 50% \
Volatility 112.0 5.848496e-17 1.004494 -2.295129 -0.525918 -0.175282
75% max skew
Volatility 0.386441 4.807839 1.654504
count mean std min 25% 50% \
Sharpe_ratio 112.0 8.723181e-17 1.004494 -3.328388 -0.659407 0.017168
75% max skew
Sharpe_ratio 0.679968 2.51131 -0.337606
count mean std min 25% 50% 75% \
Volume 112.0 2.091581e-16 1.004494 -3.040997 -0.549655 0.05034 0.555158
max skew
Volume 2.393041 -0.469879
count mean std min 25% 50% \
Intraday_Ch_Derv 112.0 3.965082e-18 1.004494 -2.680429 -0.773225 0.001271
75% max skew
Intraday_Ch_Derv 0.738853 2.454924 -0.025163
# Feature-correlation heatmap of the scaled matrix.
triag_corr(
df=X,
fig_xy=[16,8],
abs_=False,
cmap='viridis',
method='pearson',
fmt='.2%',
annot_s=14
)
# Mean/std of absolute pairwise correlations plus the 20 strongest pairs,
# rendered as a horizontal bar chart.
mu_abs_corr, std_abs_corr = mu_std_corr_matrix(corr=X.corr(), rnd=3)
top_abs_corrs = get_top_abs_correlations(X, 20)
abs_corrs_idx = [str(i)+' :: '+str(x) for i, x in zip(top_abs_corrs.index.droplevel(1), top_abs_corrs.index.droplevel(0))]
top_abs_corrs = pd.DataFrame(top_abs_corrs.values, index=abs_corrs_idx, columns=['Correlation'])
fig, ax = plt.subplots(figsize=(8,8))
top_abs_corrs.sort_values('Correlation').plot(kind='barh', ax=ax, legend=False)
for i, v in enumerate(top_abs_corrs.sort_values('Correlation').values):
# v is a 1-element row array; v + 0.045 broadcasts to the label x-position.
ax.text(x=v + 0.045, y=i, s=str(round(v[0],3)), fontweight='medium', fontsize=12,
va='center', ha='right')
plt.title(f'Top 20 Absolute Correlated Pairs \nMu: {mu_abs_corr} (+/-) {std_abs_corr}', fontsize=14)
plt.xlabel('Pearson Correlation', fontsize=14);
sns.pairplot(X, diag_kind='kde', corner=True);
# KMeans elbow curve: inertia (SSE) for k = 1..9 on the raw scaled features.
sse = {}
for k in range(1, 10):
kmeans = KMeans(n_clusters=k, random_state=1).fit(X.values)
sse[k] = kmeans.inertia_
plt.figure(figsize=(12,6))
plt.plot(list(sse.keys()), list(sse.values()), 'o-')
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()
# Visual sweep of LLE neighbor counts: one scatter per setting.
neighbors = [2, 3, 5, 7, 10, 15, 20, 30, 40, 50, 60, 65]
colors = cm.rainbow(np.linspace(0, 1, len(neighbors)))
fig, axs = plt.subplots(figsize=(14,24))
for i, n in enumerate(neighbors, 1):
plt.subplot(len(neighbors), 4, i)
embedding, _ = locally_linear_embedding(X.values, n_components=2, n_neighbors=n, method='standard', n_jobs=-1)
plt.scatter(embedding[:, 0], embedding[:, 1], s=40, color=colors[i-1], alpha=0.7, edgecolors='k', lw=0.75)
plt.title(f'N Neighbors: {n:.0f}')
plt.axis('off')
fig.tight_layout()
plt.show()
# Candidate LLE neighbor grid for the silhouette search below.
lle_n_neighbors = list(range(10, 61, 10))
# Visual sweep of UMAP (n_neighbors x min_dist) combinations.
# NOTE(review): the global `dists` defined here is what the buggy
# find_umap_k generator actually iterated — see that function's fix.
n_neighbors = [3, 5] + list(range(10, 31, 10))
dists = [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
colors = cm.rainbow(np.linspace(0, 1, len(dists)))
for n in n_neighbors:
fig, axs = plt.subplots(figsize=(14,24))
for i, d in enumerate(dists, 1):
plt.subplot(len(dists), 4, i)
embedding=umap.UMAP(n_neighbors=n, min_dist=d).fit_transform(X.values)
plt.scatter(embedding[:, 0], embedding[:, 1], s=40, color=colors[i-1], alpha=0.7, edgecolors='k', lw=0.75)
plt.title(f'Min. Distance: {d:.3f}')
plt.axis('off')
fig.suptitle(f'N Neighbors: {n}', fontsize=16)
fig.tight_layout()
fig.subplots_adjust(top=0.945)
plt.show()
# Candidate UMAP grid for the silhouette search below.
umap_n_neighbors = [5] + list(range(10, 61, 10))
umap_min_dist = [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
# Visual sweep of t-SNE (perplexity x iteration-count) combinations.
perps = [2, 3, 5] + list(range(10, 31, 10))
iters = [250, 300, 500, 1000, 2000, 3000, 4000, 5000]
colors = cm.rainbow(np.linspace(0, 1, len(iters)))
for p in perps:
fig, axs = plt.subplots(figsize=(14,24))
for i, n in enumerate(iters, 1):
plt.subplot(len(iters), 4, i)
embedding=TSNE(perplexity=p, n_iter=n, n_jobs=-1, random_state=1).fit_transform(X.values)
plt.scatter(embedding[:, 0], embedding[:, 1], s=40, color=colors[i-1], alpha=0.7, edgecolors='k', lw=0.75)
plt.title(f'Iterations: {n:.0f}')
plt.axis('off')
fig.suptitle(f'Perplexity: {p}', fontsize=16)
fig.tight_layout()
fig.subplots_adjust(top=0.945)
plt.show()
# Candidate t-SNE grid for the silhouette search below.
tsne_perplex = list(range(5, 51, 5))
tsne_iters = [2000, 3000, 4000, 5000]
Here we use linear and non-linear dimensionality-reduction techniques, together with the KMeans and hierarchical clustering methods, to find the set of parameters that maximizes the silhouette coefficient for our stock universe.
# Run the four parallel silhouette grid searches (PCA, t-SNE, UMAP, LLE),
# each over k = 3..10 clusters plus its method-specific hyperparameters.
n_clusters = list(range(3, 11))
print(f'n_clusters: {n_clusters}')
pca_res_df = find_pca_k(
X=X,
n_comp=2,
n_clusters=n_clusters,
rs=1,
n_jobs=-1,
V=5
)
print(f'tsne_perplex: {tsne_perplex}')
print(f'tsne_iters: {tsne_iters}')
tsne_res_df = find_tsne_k(
X=X,
n_comp=2,
n_clusters=n_clusters,
iters=tsne_iters,
perps=tsne_perplex,
rs=1,
n_jobs=-1,
V=5
)
print(f'umap_n_neighbors: {umap_n_neighbors}')
print(f'umap_min_dist: {umap_min_dist}')
umap_res_df = find_umap_k(
X=X,
neighbors=umap_n_neighbors,
dist=umap_min_dist,
n_clusters=n_clusters,
rs=1,
n_jobs=-1,
V=5
)
print(f'lle_n_neighbors: {lle_n_neighbors}')
lle_res_df = find_lle_k(
X=X,
n_comp=2,
neighbors=lle_n_neighbors,
n_clusters=n_clusters,
rs=1,
n_jobs=-1,
V=5
)
n_clusters: [3, 4, 5, 6, 7, 8, 9, 10]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 3 out of 8 | elapsed: 1.5s remaining: 2.5s [Parallel(n_jobs=-1)]: Done 5 out of 8 | elapsed: 1.5s remaining: 0.9s [Parallel(n_jobs=-1)]: Done 8 out of 8 | elapsed: 1.5s finished [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
tsne_perplex: [5, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50] tsne_iters: [2000, 3000, 4000, 5000]
[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 1.3s [Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 6.9s [Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 17.1s [Parallel(n_jobs=-1)]: Done 272 tasks | elapsed: 31.9s [Parallel(n_jobs=-1)]: Done 352 out of 352 | elapsed: 41.0s finished [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
umap_n_neighbors: [5, 10, 20, 30, 40, 50, 60] umap_min_dist: [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 10.9s [Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 29.7s [Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 1.1min [Parallel(n_jobs=-1)]: Done 272 tasks | elapsed: 2.0min [Parallel(n_jobs=-1)]: Done 448 out of 448 | elapsed: 3.2min finished [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
lle_n_neighbors: [10, 20, 30, 40, 50, 60]
[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 0.2s [Parallel(n_jobs=-1)]: Done 43 out of 48 | elapsed: 1.4s remaining: 0.2s [Parallel(n_jobs=-1)]: Done 48 out of 48 | elapsed: 1.7s finished
# Show the top-scoring parameter rows for each embedding method.
print('Best PCA:')
print(pca_res_df.head())
print('\nBest TSNE:')
print(tsne_res_df.head())
print('\nBest UMAP:')
print(umap_res_df.head())
print('\nBest LLE:')
print(lle_res_df.head())
Best PCA:
Silhouette_Score K
0 0.391819 3.0
5 0.383114 8.0
6 0.382631 9.0
4 0.380528 7.0
1 0.361966 4.0
Best TSNE:
Silhouette_Score K Perplexity N_Iter
220 0.532222 8.0 5.0 2000.0
224 0.532222 8.0 5.0 2000.0
226 0.506418 8.0 5.0 4000.0
227 0.506418 8.0 5.0 5000.0
223 0.506418 8.0 5.0 5000.0
Best UMAP:
Silhouette_Score K N_Neighbors Min_Dist
5 0.671127 8.0 5.0 0.001
13 0.658839 8.0 5.0 0.010
15 0.640580 10.0 5.0 0.010
10 0.637241 5.0 5.0 0.010
7 0.628794 10.0 5.0 0.001
Best LLE:
Silhouette_Score K N_Neighbors
24 0.393782 3.0 40.0
37 0.392690 8.0 50.0
16 0.391780 3.0 30.0
32 0.385605 3.0 50.0
31 0.382737 10.0 40.0
# Build the final 2-D embeddings using the best parameters found above, and
# record the winning cluster count for each method.
X_pca = PCA(
n_components=2
).fit_transform(X.values)
pca_k = 3
X_lle, _ = locally_linear_embedding(
X.values,
n_components=2,
n_neighbors=50,
method='standard',
n_jobs=-1
)
lle_k = 8
X_umap = umap.UMAP(
n_neighbors=5,
min_dist=0.001
).fit_transform(X.values)
umap_k = 8
X_tsne = TSNE(
n_components=2,
random_state=1,
perplexity=5,
n_iter=2000,
n_jobs=-1
).fit_transform(X.values)
tsne_k = 8
def show_silhouette(X: np.array, range_n_clusters: list, random_state: int = 1) -> pd.DataFrame:
    """KMeans-cluster *X* for every candidate cluster count and return the
    average silhouette score per k, sorted best-first.

    Returns a frame indexed by 'k' with one column, 'Avg_Silhouette_Score'.
    """
    silhouette_scores = {}
    for n_clusters in range_n_clusters:
        # Fixed seed so repeated calls produce identical clusterings.
        clusterer = KMeans(n_clusters=n_clusters, random_state=random_state)
        cluster_labels = clusterer.fit_predict(X)
        # Average silhouette over all samples: a summary of cluster density
        # and separation.
        silhouette_scores[n_clusters] = silhouette_score(X, cluster_labels)
        # Fix: the original also called silhouette_samples() here on every
        # iteration but never used the result — that dead (and costly)
        # computation has been removed.
    return pd.DataFrame.from_dict(silhouette_scores, orient='index')\
        .rename(columns={0: 'Avg_Silhouette_Score'})\
        .rename_axis('k').sort_values('Avg_Silhouette_Score', ascending=False)
# Silhouette scores for KMeans on the raw scaled features, k = 2..9.
n_clusters = list(range(2, 10))
silhouette_scores = show_silhouette(
X=X.values,
range_n_clusters=n_clusters,
)
silhouette_scores.head(3)
| Avg_Silhouette_Score | |
|---|---|
| k | |
| 2 | 0.235949 |
| 9 | 0.218415 |
| 3 | 0.216407 |
# KMeans on the raw scaled features (k=9), label counts, and a scatter of
# the first two feature columns colored by cluster.
kmeans = KMeans(n_clusters=9, random_state=1)
kmeans.fit(X)
stock_profile_df['KMeans_Labels'] = kmeans.predict(X)
labels = kmeans.predict(X)
colors = cm.rainbow(np.linspace(0, 1, stock_profile_df['KMeans_Labels'].nunique()))
stock_profile_df.KMeans_Labels.value_counts().plot(kind='barh', color=colors)
plt.title('Count of Unique Cluster Members')
plt.xlabel('Member Count')
plt.ylabel('Cluster ID');
plt.figure(1, facecolor='white', figsize=(10, 6))
plt.clf()
plt.axis('off')
# NOTE(review): labels[labels!=-1] looks like a DBSCAN noise filter — KMeans
# never yields -1, so it is a no-op here (and would misalign x/y vs color if
# it ever filtered).
scatter0 = plt.scatter(X.values[:,0], X.values[:,1], s=200, alpha=0.8,
c=labels[labels!=-1], cmap='rainbow', edgecolors='k', lw=0.75)
plt.legend(*scatter0.legend_elements(), title="Cluster", bbox_to_anchor=(1.09, 0.99))
# NOTE(review): the title says 'UMAP'/'KMedoids', but this plots the first
# two raw feature columns with KMeans labels — title appears stale.
plt.title('UMAP of all stocks with KMedoids clusters noted');
df_kmeans = stock_profile_df.groupby('KMeans_Labels').mean()
df_kmeans.T
| KMeans_Labels | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
|---|---|---|---|---|---|---|---|---|---|
| Returns | 0.023505 | 0.037738 | 0.024542 | 0.069572 | 0.140713 | 0.021562 | 0.039367 | -0.002353 | 0.012441 |
| Volatility | 1.009593 | 0.915668 | 0.663810 | 1.389844 | 4.394597 | 0.664518 | 0.746932 | 1.958995 | 1.301234 |
| Sharpe_ratio | 0.372102 | 0.660445 | 0.608855 | 0.809834 | 0.661472 | 0.534426 | 0.861454 | -0.047219 | 0.141158 |
| Volume | 3500.033567 | 4214.185851 | 3875.378023 | 3903.491697 | 4041.980287 | 3143.869253 | 3633.827817 | 4089.377234 | 4105.366238 |
| Intraday_Ch_Derv | 4.494311 | 9.070455 | -9.805831 | -3.494516 | 9.900266 | -7.183060 | 3.369816 | 9.675080 | -10.518090 |
# Per-feature boxplots split by KMeans cluster, then extract the members of
# cluster 6 (the high-Sharpe cluster per the table above).
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(16,28))
plt.subplots_adjust(right=2)
plt.subplots_adjust(top=2)
for i, feature in enumerate(og_stock_profile_df.columns.to_list(), 1):
plt.subplot(len(stock_profile_df.columns), 2, i)
sns.boxplot(x='KMeans_Labels', y=feature, data=stock_profile_df, palette='husl')
plt.title(f'{feature}', size=15, fontsize=12)
plt.tight_layout()
plt.show()
Km_max_sharpe_cluster_pf = stock_profile_df[stock_profile_df.KMeans_Labels == 6].index.to_list()
print(Km_max_sharpe_cluster_pf)
['ADBE', 'AMZN', 'ANTM', 'ASML', 'DE', 'DG', 'EFX', 'GOOGL', 'HD', 'HON', 'LHX', 'MA', 'MCO', 'NOC', 'SHY', 'TMO', 'UNH', 'VOO', 'WCN']
# Shrunken covariance of the high-Sharpe KMeans cluster, PCA of it, and
# eigen-portfolio construction/backtest from the top components.
cov = returns[Km_max_sharpe_cluster_pf].cov()
# NOTE(review): LedoitWolf().fit() is given the covariance *matrix* rather
# than the raw returns — the estimator normally expects samples; confirm
# this shrink-the-covariance-of-a-covariance usage is intended.
lw_cov = LedoitWolf().fit(cov)
rbust_cov = lw_cov.covariance_
sns.clustermap(pd.DataFrame(rbust_cov, index=cov.index, columns=cov.columns), figsize=(12,8), row_cluster=False)
plt.show()
n = rbust_cov.shape[1]
pca = PCA(n_components=n, random_state=1)
data_pca = pd.DataFrame(pca.fit_transform(rbust_cov))
exp_var = pca.explained_variance_ratio_
# visualize the Explained Individual Components
plt.figure(figsize = (13,6))
plt.plot(range(1,n+1), pca.explained_variance_ratio_.cumsum(), marker = 'o', linestyle = '--', color='blue')
plt.plot(range(1,n+1), pca.explained_variance_ratio_, marker = 'o', color='red')
plt.title("Cumulative & Explained Variances by Components", fontsize=14)
plt.xlabel("Number of Components", fontsize=14)
plt.ylabel("Variance", fontsize=14)
plt.legend(['Cumulative', 'Explained']);
# Top 9 principal components -> min-max scaled, then normalized to sum to 1
# per row so each row is a long-only weight vector (an "eigen portfolio").
topPCs = pd.DataFrame(pca.components_[:9], columns=cov.columns)
eigen_portfolios = topPCs.subtract(topPCs.min()).div(topPCs.max().subtract(topPCs.min()))
eigen_portfolios = eigen_portfolios.div(eigen_portfolios.sum(1), axis=0)
# eigen_portfolios = topPCs.div(topPCs.sum(1), axis=0)
eigen_portfolios.index = [f'Portfolio {i}' for i in range(1, topPCs.shape[0]+1)]
eigen_portfolios.T.plot.bar(subplots=True, figsize=(15,6), legend=False, sharex=True, layout=(3,3))
plt.tight_layout();
# Cumulative-return panels: SPY benchmark first, then each eigen portfolio,
# each annotated with its annualized Sharpe ratio.
fig, axs = plt.subplots(figsize=(16,30))
n_pfs = topPCs.shape[0]+1
for i, c in zip(range(n_pfs), ['k', 'b', 'orangered', 'g', 'r', 'purple', 'brown', 'hotpink', 'grey', 'y']):
plt.subplot(n_pfs, 3, i+1)
if i == 0:
tmp = returns.SPY.cumsum()
sr = (returns.SPY.mean() / returns.SPY.std()) * np.sqrt(252)
tmp.plot(title=f'The Market \nSharpe Ratio: {round(sr,3)}', color=c)
else:
tmp = returns[Km_max_sharpe_cluster_pf].mul(eigen_portfolios.iloc[i-1]).sum(1)
sr = (tmp.mean() / tmp.std()) * np.sqrt(252)
tmp.cumsum().plot(color=c, title=f'Portfolio {i} \nSharpe Ratio: {round(sr,3)}')
plt.tight_layout()
plt.show()
# KMeans on the PCA embedding (k = pca_k), label counts, and the embedding
# scatter colored by cluster.
kmeans = KMeans(n_clusters=pca_k, random_state=1)
kmeans.fit(X_pca)
stock_profile_df['PCA_KMeans_Labels'] = kmeans.predict(X_pca)
labels = kmeans.predict(X_pca)
colors = cm.rainbow(np.linspace(0, 1, stock_profile_df['PCA_KMeans_Labels'].nunique()))
stock_profile_df.PCA_KMeans_Labels.value_counts().plot(kind='barh', color=colors)
plt.title('Count of Unique Cluster Members')
plt.xlabel('Member Count')
plt.ylabel('Cluster ID');
plt.figure(1, facecolor='white', figsize=(10, 6))
plt.clf()
plt.axis('off')
# labels[labels!=-1]: no-op noise filter (KMeans never emits -1).
scatter0 = plt.scatter(X_pca[:,0], X_pca[:,1], s=200, alpha=0.8,
c=labels[labels!=-1], cmap='rainbow', edgecolors='k', lw=0.75)
plt.legend(*scatter0.legend_elements(), title="Cluster", bbox_to_anchor=(1.09, 0.99))
# NOTE(review): clusters are KMeans, not KMedoids — title appears stale.
plt.title('PCA of all stocks with KMedoids clusters noted');
df_kmeans = stock_profile_df.groupby('PCA_KMeans_Labels').mean()
df_kmeans.T
| PCA_KMeans_Labels | 0 | 1 | 2 |
|---|---|---|---|
| Returns | 0.006575 | 0.029315 | 0.077744 |
| Volatility | 1.499422 | 0.765512 | 1.880133 |
| Sharpe_ratio | 0.067231 | 0.629326 | 0.765196 |
| Volume | 4046.594185 | 3706.825859 | 4031.932955 |
| Intraday_Ch_Derv | -3.582743 | -1.026779 | 5.750998 |
| KMeans_Labels | 7.187500 | 2.876712 | 2.565217 |
# Per-feature boxplots split by PCA-KMeans cluster, then extract cluster 2
# (the high-Sharpe cluster per the table above).
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(16,28))
plt.subplots_adjust(right=2)
plt.subplots_adjust(top=2)
for i, feature in enumerate(og_stock_profile_df.columns.to_list(), 1):
plt.subplot(len(stock_profile_df.columns), 2, i)
sns.boxplot(x='PCA_KMeans_Labels', y=feature, data=stock_profile_df, palette='husl')
plt.title(f'{feature}', size=15, fontsize=12)
plt.tight_layout()
plt.show()
PCA_max_sharpe_cluster_pf = stock_profile_df[stock_profile_df.PCA_KMeans_Labels == 2].index.to_list()
print(PCA_max_sharpe_cluster_pf)
['AAPL', 'ADBE', 'AMAT', 'AMC', 'AMD', 'ATVI', 'BAC', 'BX', 'EA', 'FB', 'FTNT', 'GME', 'INTC', 'JPM', 'MSFT', 'NFLX', 'NVDA', 'TGT', 'TMF', 'TSLA', 'TSM', 'TTWO', 'UPRO']
# Same eigen-portfolio pipeline as before, now on the PCA-KMeans
# high-Sharpe cluster: shrunken covariance -> PCA -> weights -> backtest.
cov = returns[PCA_max_sharpe_cluster_pf].cov()
# NOTE(review): LedoitWolf fit on the covariance matrix, not returns —
# confirm intended (same pattern as the earlier cell).
lw_cov = LedoitWolf().fit(cov)
rbust_cov = lw_cov.covariance_
sns.clustermap(pd.DataFrame(rbust_cov, index=cov.index, columns=cov.columns), figsize=(12,8), row_cluster=False)
plt.show()
n = rbust_cov.shape[1]
pca = PCA(n_components=n, random_state=1)
data_pca = pd.DataFrame(pca.fit_transform(rbust_cov))
exp_var = pca.explained_variance_ratio_
# visualize the Explained Individual Components
plt.figure(figsize = (13,6))
plt.plot(range(1,n+1), pca.explained_variance_ratio_.cumsum(), marker = 'o', linestyle = '--', color='blue')
plt.plot(range(1,n+1), pca.explained_variance_ratio_, marker = 'o', color='red')
plt.title("Cumulative & Explained Variances by Components", fontsize=14)
plt.xlabel("Number of Components", fontsize=14)
plt.ylabel("Variance", fontsize=14)
plt.legend(['Cumulative', 'Explained']);
# Top 9 components min-max scaled and row-normalized into weight vectors.
topPCs = pd.DataFrame(pca.components_[:9], columns=cov.columns)
eigen_portfolios = topPCs.subtract(topPCs.min()).div(topPCs.max().subtract(topPCs.min()))
eigen_portfolios = eigen_portfolios.div(eigen_portfolios.sum(1), axis=0)
# eigen_portfolios = topPCs.div(topPCs.sum(1), axis=0)
eigen_portfolios.index = [f'Portfolio {i}' for i in range(1, topPCs.shape[0]+1)]
eigen_portfolios.T.plot.bar(subplots=True, figsize=(15,6), legend=False, sharex=True, layout=(3,3))
plt.tight_layout();
# Cumulative-return panels vs SPY with annualized Sharpe ratios.
fig, axs = plt.subplots(figsize=(16,30))
n_pfs = topPCs.shape[0]+1
for i, c in zip(range(n_pfs), ['k', 'b', 'orangered', 'g', 'r', 'purple', 'brown', 'hotpink', 'grey', 'y']):
plt.subplot(n_pfs, 3, i+1)
if i == 0:
tmp = returns.SPY.cumsum()
sr = (returns.SPY.mean() / returns.SPY.std()) * np.sqrt(252)
tmp.plot(title=f'The Market \nSharpe Ratio: {round(sr,3)}', color=c)
else:
tmp = returns[PCA_max_sharpe_cluster_pf].mul(eigen_portfolios.iloc[i-1]).sum(1)
sr = (tmp.mean() / tmp.std()) * np.sqrt(252)
tmp.cumsum().plot(color=c, title=f'Portfolio {i} \nSharpe Ratio: {round(sr,3)}')
plt.tight_layout()
plt.show()
# KMeans on the UMAP embedding (k = umap_k), label counts, and the embedding
# scatter colored by cluster.
kmeans = KMeans(n_clusters=umap_k, random_state=1)
kmeans.fit(X_umap)
stock_profile_df['UMAP_KMeans_Labels'] = kmeans.predict(X_umap)
labels = kmeans.predict(X_umap)
colors = cm.rainbow(np.linspace(0, 1, stock_profile_df['UMAP_KMeans_Labels'].nunique()))
stock_profile_df.UMAP_KMeans_Labels.value_counts().plot(kind='barh', color=colors)
plt.title('Count of Unique Cluster Members')
plt.xlabel('Member Count')
plt.ylabel('Cluster ID');
plt.figure(1, facecolor='white', figsize=(10, 6))
plt.clf()
plt.axis('off')
# labels[labels!=-1]: no-op noise filter (KMeans never emits -1).
scatter0 = plt.scatter(X_umap[:,0], X_umap[:,1], s=200, alpha=0.8,
c=labels[labels!=-1], cmap='rainbow', edgecolors='k', lw=0.75)
plt.legend(*scatter0.legend_elements(), title="Cluster", bbox_to_anchor=(1.09, 0.99))
# NOTE(review): clusters are KMeans, not KMedoids — title appears stale.
plt.title('UMAP of all stocks with KMedoids clusters noted');
df_kmeans = stock_profile_df.groupby('UMAP_KMeans_Labels').mean()
df_kmeans.T
| UMAP_KMeans_Labels | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
|---|---|---|---|---|---|---|---|---|
| Returns | 0.037555 | 0.022711 | 0.093825 | 0.007510 | 0.038789 | 0.018251 | 0.043422 | 0.031867 |
| Volatility | 0.903528 | 0.945950 | 2.321256 | 1.520488 | 0.659630 | 0.541438 | 0.924982 | 0.714157 |
| Sharpe_ratio | 0.664399 | 0.389229 | 0.774091 | 0.078366 | 0.962368 | 0.580786 | 0.764208 | 0.696077 |
| Volume | 4138.275552 | 3443.281643 | 3956.403331 | 4100.036570 | 3602.547543 | 3904.870679 | 3835.832203 | 3278.043204 |
| Intraday_Ch_Derv | 8.785843 | 1.967538 | 1.453674 | -3.787034 | 6.670867 | -9.625828 | -9.153052 | -6.441583 |
| KMeans_Labels | 1.608696 | 1.400000 | 3.285714 | 7.666667 | 6.000000 | 2.000000 | 3.100000 | 5.444444 |
| PCA_KMeans_Labels | 1.304348 | 1.000000 | 2.000000 | 0.000000 | 1.111111 | 1.000000 | 1.000000 | 1.000000 |
# Box-plot each original profile feature by UMAP-KMeans cluster to compare distributions.
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(16,28))
plt.subplots_adjust(right=2)
plt.subplots_adjust(top=2)
for i, feature in enumerate(og_stock_profile_df.columns.to_list(), 1):
    plt.subplot(len(stock_profile_df.columns), 2, i)
    sns.boxplot(x='UMAP_KMeans_Labels', y=feature, data=stock_profile_df, palette='husl')
    plt.title(f'{feature}', size=15, fontsize=12)
plt.tight_layout()
plt.show()
# Members of UMAP-KMeans cluster 4 form the candidate max-Sharpe portfolio
# (presumably chosen for its top Sharpe ratio in the cluster summary — confirm).
UMAP_max_sharpe_cluster_pf = stock_profile_df[stock_profile_df.UMAP_KMeans_Labels == 4].index.to_list()
print(UMAP_max_sharpe_cluster_pf)
['ADBE', 'ASML', 'GOOGL', 'NOC', 'SHY', 'TMO', 'UNH', 'VOO', 'WCN']
# Shrunk covariance of the selected cluster's returns, then PCA on it.
cov = returns[UMAP_max_sharpe_cluster_pf].cov()
# NOTE(review): LedoitWolf.fit is applied to the covariance matrix itself rather
# than to the raw returns — confirm this is intended.
lw_cov = LedoitWolf().fit(cov)
rbust_cov = lw_cov.covariance_
sns.clustermap(pd.DataFrame(rbust_cov, index=cov.index, columns=cov.columns), figsize=(12,8), row_cluster=False)
plt.show()
# Full-rank PCA of the robust covariance; explained variance drives the plot below.
n = rbust_cov.shape[1]
pca = PCA(n_components=n, random_state=1)
data_pca = pd.DataFrame(pca.fit_transform(rbust_cov))
exp_var = pca.explained_variance_ratio_
# visualize the Explained Individual Components
plt.figure(figsize = (13,6))
plt.plot(range(1,n+1), pca.explained_variance_ratio_.cumsum(), marker = 'o', linestyle = '--', color='blue')
plt.plot(range(1,n+1), pca.explained_variance_ratio_, marker = 'o', color='red')
# --- Eigen-portfolios from the PCA of the UMAP-cluster covariance ---
plt.title("Cumulative & Explained Variances by Components", fontsize=14)
plt.xlabel("Number of Components", fontsize=14)
plt.ylabel("Variance", fontsize=14)
plt.legend(['Cumulative', 'Explained']);
topPCs = pd.DataFrame(pca.components_[:9], columns=cov.columns)
# Min-max scale loadings per component, then row-normalise to 1 (long-only weights).
eigen_portfolios = topPCs.subtract(topPCs.min()).div(topPCs.max().subtract(topPCs.min()))
eigen_portfolios = eigen_portfolios.div(eigen_portfolios.sum(1), axis=0)
# eigen_portfolios = topPCs.div(topPCs.sum(1), axis=0)
eigen_portfolios.index = [f'Portfolio {i}' for i in range(1, topPCs.shape[0]+1)]
eigen_portfolios.T.plot.bar(subplots=True, figsize=(15,6), legend=False, sharex=True, layout=(3,3))
plt.tight_layout();
# Cumulative return of each eigen-portfolio vs SPY, annotated with annualised Sharpe.
fig, axs = plt.subplots(figsize=(16,30))
n_pfs = topPCs.shape[0]+1
for i, c in zip(range(n_pfs), ['k', 'b', 'orangered', 'g', 'r', 'purple', 'brown', 'hotpink', 'grey', 'y']):
    plt.subplot(n_pfs, 3, i+1)
    if i == 0:
        tmp = returns.SPY.cumsum()
        sr = (returns.SPY.mean() / returns.SPY.std()) * np.sqrt(252)
        tmp.plot(title=f'The Market \nSharpe Ratio: {round(sr,3)}', color=c)
    else:
        tmp = returns[UMAP_max_sharpe_cluster_pf].mul(eigen_portfolios.iloc[i-1]).sum(1)
        sr = (tmp.mean() / tmp.std()) * np.sqrt(252)
        tmp.cumsum().plot(color=c, title=f'Portfolio {i} \nSharpe Ratio: {round(sr,3)}')
plt.tight_layout()
plt.show()
# --- Cluster stocks in LLE space with KMeans and inspect the clusters ---
kmeans = KMeans(n_clusters=lle_k, random_state=1)
kmeans.fit(X_lle)
# labels_ of the fitted model equal predict(X_lle) on the training data,
# so reuse them instead of predicting twice.
labels = kmeans.labels_
stock_profile_df['LLE_KMeans_Labels'] = labels
# Bar chart of cluster sizes.
colors = cm.rainbow(np.linspace(0, 1, stock_profile_df['LLE_KMeans_Labels'].nunique()))
stock_profile_df.LLE_KMeans_Labels.value_counts().plot(kind='barh', color=colors)
plt.title('Count of Unique Cluster Members')
plt.xlabel('Member Count')
plt.ylabel('Cluster ID');
# 2-D scatter of the LLE embedding, coloured by cluster label.
plt.figure(1, facecolor='white', figsize=(10, 6))
plt.clf()
plt.axis('off')
scatter0 = plt.scatter(X_lle[:,0], X_lle[:,1], s=200, alpha=0.8,
                       c=labels[labels!=-1], cmap='rainbow', edgecolors='k', lw=0.75)
plt.legend(*scatter0.legend_elements(), title="Cluster", bbox_to_anchor=(1.09, 0.99))
# Fixed: title previously said "KMedoids" although KMeans is used in this section.
plt.title('LLE of all stocks with KMeans clusters noted');
# Per-cluster mean of each profile feature.
df_kmeans = stock_profile_df.groupby('LLE_KMeans_Labels').mean()
df_kmeans.T
| LLE_KMeans_Labels | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
|---|---|---|---|---|---|---|---|---|
| Returns | 0.065348 | 0.021624 | 0.023108 | 0.058070 | 0.140713 | 0.015228 | 0.038811 | 0.003727 |
| Volatility | 1.046350 | 2.134923 | 0.810944 | 1.373624 | 4.394597 | 0.405993 | 0.819835 | 1.240047 |
| Sharpe_ratio | 0.998498 | 0.191161 | 0.454780 | 0.679187 | 0.661472 | 0.709339 | 0.767571 | 0.050270 |
| Volume | 3748.270884 | 4303.949826 | 3799.302550 | 4075.687270 | 4041.980287 | 3298.725014 | 3742.450428 | 3982.060992 |
| Intraday_Ch_Derv | 5.113953 | 6.846125 | -2.390242 | 5.708185 | 9.900266 | -6.054209 | 0.684388 | -6.968046 |
| KMeans_Labels | 4.111111 | 6.000000 | 1.419355 | 1.785714 | 4.000000 | 3.700000 | 4.037037 | 7.166667 |
| PCA_KMeans_Labels | 1.666667 | 0.400000 | 1.000000 | 1.857143 | 2.000000 | 1.000000 | 1.000000 | 0.000000 |
| UMAP_KMeans_Labels | 2.666667 | 2.400000 | 2.290323 | 0.928571 | 2.000000 | 4.900000 | 3.740741 | 2.833333 |
# Box-plot each original profile feature by LLE-KMeans cluster.
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(16,28))
plt.subplots_adjust(right=2)
plt.subplots_adjust(top=2)
for i, feature in enumerate(og_stock_profile_df.columns.to_list(), 1):
    plt.subplot(len(stock_profile_df.columns), 2, i)
    sns.boxplot(x='LLE_KMeans_Labels', y=feature, data=stock_profile_df, palette='husl')
    plt.title(f'{feature}', size=15, fontsize=12)
plt.tight_layout()
plt.show()
# Members of LLE-KMeans cluster 0 form the candidate max-Sharpe portfolio.
Lle_max_sharpe_cluster_pf = stock_profile_df[stock_profile_df.LLE_KMeans_Labels == 0].index.to_list()
print(Lle_max_sharpe_cluster_pf)
# Shrunk covariance of the selected names' returns.
cov = returns[Lle_max_sharpe_cluster_pf].cov()
lw_cov = LedoitWolf().fit(cov)
rbust_cov = lw_cov.covariance_
['ADBE', 'ASML', 'EA', 'FTNT', 'MSFT', 'TSM', 'TTWO', 'UNH', 'WCN']
# Cluster map of the shrunk covariance for the LLE-selected portfolio.
sns.clustermap(pd.DataFrame(rbust_cov, index=cov.index, columns=cov.columns), figsize=(12,8), row_cluster=False)
plt.show()
# Full-rank PCA of the robust covariance.
n = rbust_cov.shape[1]
pca = PCA(n_components=n, random_state=1)
data_pca = pd.DataFrame(pca.fit_transform(rbust_cov))
exp_var = pca.explained_variance_ratio_
# visualize the Explained Individual Components
plt.figure(figsize = (13,6))
plt.plot(range(1,n+1), pca.explained_variance_ratio_.cumsum(), marker = 'o', linestyle = '--', color='blue')
plt.plot(range(1,n+1), pca.explained_variance_ratio_, marker = 'o', color='red')
plt.title("Cumulative & Explained Variances by Components", fontsize=14)
plt.xlabel("Number of Components", fontsize=14)
plt.ylabel("Variance", fontsize=14)
plt.legend(['Cumulative', 'Explained']);
topPCs = pd.DataFrame(pca.components_[:9], columns=cov.columns)
# Min-max scale loadings, then row-normalise to 1 (long-only weights).
eigen_portfolios = topPCs.subtract(topPCs.min()).div(topPCs.max().subtract(topPCs.min()))
eigen_portfolios = eigen_portfolios.div(eigen_portfolios.sum(1), axis=0)
# eigen_portfolios = topPCs.div(topPCs.sum(1), axis=0)
eigen_portfolios.index = [f'Portfolio {i}' for i in range(1, topPCs.shape[0]+1)]
eigen_portfolios.T.plot.bar(subplots=True, figsize=(15,6), legend=False, sharex=True, layout=(3,3))
plt.tight_layout();
# Cumulative return of each eigen-portfolio vs SPY with annualised Sharpe.
fig, axs = plt.subplots(figsize=(16,30))
n_pfs = topPCs.shape[0]+1
for i, c in zip(range(n_pfs), ['k', 'b', 'orangered', 'g', 'r', 'purple', 'brown', 'hotpink', 'grey', 'y']):
    plt.subplot(n_pfs, 3, i+1)
    if i == 0:
        tmp = returns.SPY.cumsum()
        sr = (returns.SPY.mean() / returns.SPY.std()) * np.sqrt(252)
        tmp.plot(title=f'The Market \nSharpe Ratio: {round(sr,3)}', color=c)
    else:
        tmp = returns[Lle_max_sharpe_cluster_pf].mul(eigen_portfolios.iloc[i-1]).sum(1)
        sr = (tmp.mean() / tmp.std()) * np.sqrt(252)
        tmp.cumsum().plot(color=c, title=f'Portfolio {i} \nSharpe Ratio: {round(sr,3)}')
plt.tight_layout()
plt.show()
# --- Cluster stocks in T-SNE space with KMeans and inspect the clusters ---
kmeans = KMeans(n_clusters=tsne_k, random_state=1)
kmeans.fit(X_tsne)
# labels_ of the fitted model equal predict(X_tsne) on the training data,
# so reuse them instead of predicting twice.
labels = kmeans.labels_
stock_profile_df['TSNE_KMeans_Labels'] = labels
# Bar chart of cluster sizes.
colors = cm.rainbow(np.linspace(0, 1, stock_profile_df['TSNE_KMeans_Labels'].nunique()))
stock_profile_df.TSNE_KMeans_Labels.value_counts().plot(kind='barh', color=colors)
plt.title('Count of Unique Cluster Members')
plt.xlabel('Member Count')
plt.ylabel('Cluster ID');
# 2-D scatter of the T-SNE embedding, coloured by cluster label.
plt.figure(1, facecolor='white', figsize=(10, 6))
plt.clf()
plt.axis('off')
scatter0 = plt.scatter(X_tsne[:,0], X_tsne[:,1], s=200, alpha=0.8,
                       c=labels[labels!=-1], cmap='rainbow', edgecolors='k', lw=0.75)
plt.legend(*scatter0.legend_elements(), title="Cluster", bbox_to_anchor=(1.0, 0.99))
# Fixed: title previously said "KMedoids" although KMeans is used in this section.
plt.title('T-SNE of all stocks with KMeans clusters noted');
# Per-cluster mean of each profile feature.
df_kmeans = stock_profile_df.groupby('TSNE_KMeans_Labels').mean()
df_kmeans.T
| TSNE_KMeans_Labels | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
|---|---|---|---|---|---|---|---|---|
| Returns | 0.035872 | 0.012130 | 0.028891 | 0.011117 | 0.021429 | 0.039263 | 0.044856 | 0.093825 |
| Volatility | 0.892890 | 0.830915 | 1.106005 | 1.517813 | 0.661040 | 0.723302 | 0.953881 | 2.321256 |
| Sharpe_ratio | 0.638547 | 0.345871 | 0.416858 | 0.115258 | 0.553595 | 0.890288 | 0.768157 | 0.774091 |
| Volume | 4104.798950 | 3312.688664 | 3126.107977 | 4124.309902 | 3836.330037 | 3551.634769 | 3835.434780 | 3956.403331 |
| Intraday_Ch_Derv | 8.970677 | -7.150025 | 9.274703 | -4.182887 | -5.980373 | 2.841130 | -9.912272 | 1.453674 |
| KMeans_Labels | 1.480000 | 4.000000 | 0.833333 | 7.714286 | 1.684211 | 5.928571 | 3.222222 | 3.285714 |
| PCA_KMeans_Labels | 1.280000 | 0.818182 | 1.166667 | 0.000000 | 1.000000 | 1.071429 | 1.000000 | 2.000000 |
| UMAP_KMeans_Labels | 0.080000 | 3.727273 | 1.000000 | 3.000000 | 3.578947 | 5.071429 | 6.000000 | 2.000000 |
| LLE_KMeans_Labels | 3.440000 | 4.000000 | 3.833333 | 5.285714 | 2.894737 | 4.214286 | 5.111111 | 2.428571 |
# Box-plot each original profile feature by T-SNE-KMeans cluster.
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(16,28))
plt.subplots_adjust(right=2)
plt.subplots_adjust(top=2)
for i, feature in enumerate(og_stock_profile_df.columns.to_list(), 1):
    plt.subplot(len(stock_profile_df.columns), 2, i)
    sns.boxplot(x='TSNE_KMeans_Labels', y=feature, data=stock_profile_df, palette='husl')
    plt.title(f'{feature}', size=15, fontsize=12)
plt.tight_layout()
plt.show()
# Members of T-SNE-KMeans cluster 5 form the candidate max-Sharpe portfolio.
TSNE_max_sharpe_cluster_pf = stock_profile_df[stock_profile_df.TSNE_KMeans_Labels == 5].index.to_list()
print(TSNE_max_sharpe_cluster_pf)
# Shrunk covariance of the selected names' returns.
cov = returns[TSNE_max_sharpe_cluster_pf].cov()
lw_cov = LedoitWolf().fit(cov)
rbust_cov = lw_cov.covariance_
['ADBE', 'ANTM', 'ASML', 'EFX', 'GOOGL', 'LHX', 'MCO', 'NOC', 'ORLY', 'SHY', 'TMO', 'UNH', 'VOO', 'WCN']
# Cluster map of the shrunk covariance for the T-SNE-selected portfolio.
sns.clustermap(pd.DataFrame(rbust_cov, index=cov.index, columns=cov.columns), figsize=(12,8), row_cluster=False)
plt.show()
# Full-rank PCA of the robust covariance.
n = rbust_cov.shape[1]
pca = PCA(n_components=n, random_state=1)
data_pca = pd.DataFrame(pca.fit_transform(rbust_cov))
exp_var = pca.explained_variance_ratio_
# visualize the Explained Individual Components
plt.figure(figsize = (13,6))
plt.plot(range(1,n+1), pca.explained_variance_ratio_.cumsum(), marker = 'o', linestyle = '--', color='blue')
plt.plot(range(1,n+1), pca.explained_variance_ratio_, marker = 'o', color='red')
plt.title("Cumulative & Explained Variances by Components", fontsize=14)
plt.xlabel("Number of Components", fontsize=14)
plt.ylabel("Variance", fontsize=14)
plt.legend(['Cumulative', 'Explained']);
topPCs = pd.DataFrame(pca.components_[:9], columns=cov.columns)
# Min-max scale loadings, then row-normalise to 1 (long-only weights).
eigen_portfolios = topPCs.subtract(topPCs.min()).div(topPCs.max().subtract(topPCs.min()))
eigen_portfolios = eigen_portfolios.div(eigen_portfolios.sum(1), axis=0)
# eigen_portfolios = topPCs.div(topPCs.sum(1), axis=0)
eigen_portfolios.index = [f'Portfolio {i}' for i in range(1, topPCs.shape[0]+1)]
eigen_portfolios.T.plot.bar(subplots=True, figsize=(15,5), legend=False, sharex=True, layout=(3,3))
plt.tight_layout();
fig, axs = plt.subplots(figsize=(16,30))
n_pfs = topPCs.shape[0]+1
# NOTE(review): only 9 colours are listed here for up to n_pfs panels; zip stops
# at the shorter sequence, so the last portfolio may be silently skipped — confirm.
for i, c in zip(range(n_pfs), ['k', 'b', 'orangered', 'g', 'r', 'purple', 'brown', 'hotpink', 'grey']):
    plt.subplot(n_pfs, 3, i+1)
    if i == 0:
        tmp = returns.SPY.cumsum()
        sr = (returns.SPY.mean() / returns.SPY.std()) * np.sqrt(252)
        tmp.plot(title=f'The Market \nSharpe Ratio: {round(sr,3)}', color=c)
    else:
        tmp = returns[TSNE_max_sharpe_cluster_pf].mul(eigen_portfolios.iloc[i-1]).sum(1)
        # Guard against zero volatility so the Sharpe computation never divides by zero.
        STD = tmp.std() if tmp.std() > 0 else 1
        sr = (tmp.mean() / STD) * np.sqrt(252)
        tmp.cumsum().plot(color=c, title=f'Portfolio {i} \nSharpe Ratio: {round(sr,3)}')
plt.tight_layout()
plt.show()
# List of all linkage methods to check
methods = ['single', 'complete', 'average', 'ward']
# Cophenetic correlation measures how faithfully each dendrogram preserves
# the pairwise distances of the T-SNE embedding.
pairwise_distance = pdist(X_tsne)
fig, axs = plt.subplots(len(methods), 1, figsize=(16, 18))
for i, method in enumerate(methods):
    Z = linkage(X_tsne, metric='euclidean', method=method)
    c, coph_dists = cophenet(Z, pairwise_distance)
    dendrogram(Z, leaf_rotation=90, leaf_font_size=9, labels=stock_profile_df.index, ax=axs[i]);
    axs[i].set_title(f'Method: {method.capitalize()} | Pairwise Distance Correlation: {round(c,4)}')
    axs[i].set_ylabel('Distance')
plt.tight_layout();
# Zoom in on the Ward dendrogram with a candidate cut height marked in red.
plt.figure(figsize=(20, 4))
plt.title("Dendrograms")
dend = dendrogram(linkage(X_tsne, method='ward'), leaf_font_size=9, leaf_rotation=90, labels=stock_profile_df.index)
plt.axhline(y=500, color='red', linestyle='--');
# Agglomerative clustering on the T-SNE embedding.
# NOTE(review): the zoomed dendrogram above was built with method='ward', but
# the model here uses linkage='single' — confirm the mismatch is intentional.
hierarchical = AgglomerativeClustering(n_clusters=6, affinity='euclidean', linkage='single')
hierarchical.fit(X_tsne)
stock_profile_df['TSNE_HCLabels'] = hierarchical.labels_
labels = hierarchical.labels_
# Bar chart of cluster sizes.
colors = cm.rainbow(np.linspace(0, 1, stock_profile_df['TSNE_HCLabels'].nunique()))
stock_profile_df.TSNE_HCLabels.value_counts().plot(kind='barh', color=colors)
plt.title('Count of Unique Cluster Members')
plt.xlabel('Member Count')
plt.ylabel('Cluster ID');
# 2-D scatter of the T-SNE embedding, coloured by hierarchical cluster.
plt.figure(1, facecolor='white', figsize=(10, 6))
plt.clf()
plt.axis('off')
scatter0 = plt.scatter(X_tsne[:,0], X_tsne[:,1], s=200, alpha=0.8,
                       c=labels[labels!=-1], cmap='rainbow', edgecolors='k', lw=0.75)
plt.legend(*scatter0.legend_elements(), title="Cluster", bbox_to_anchor=(1.1, 0.99))
plt.title('T-SNE of all stocks with Hierarchical clusters noted');
# Per-cluster mean of each profile feature.
df_hc = stock_profile_df.groupby('TSNE_HCLabels').mean()
df_hc.T
| TSNE_HCLabels | 0 | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|---|
| Returns | 0.028798 | 0.093825 | 0.011117 | 0.013605 | 0.031949 | 0.044856 |
| Volatility | 0.784758 | 2.321256 | 1.517813 | 0.645083 | 1.198190 | 0.953881 |
| Sharpe_ratio | 0.619574 | 0.774091 | 0.115258 | 0.334806 | 0.433269 | 0.768157 |
| Volume | 3792.358065 | 3956.403331 | 4124.309902 | 3188.549734 | 3113.619626 | 3835.434780 |
| Intraday_Ch_Derv | 1.040078 | 1.453674 | -4.182887 | 9.117445 | 9.306154 | -9.912272 |
| KMeans_Labels | 2.840580 | 3.285714 | 7.714286 | 0.000000 | 1.000000 | 3.222222 |
| PCA_KMeans_Labels | 1.086957 | 2.000000 | 0.000000 | 1.000000 | 1.200000 | 1.000000 |
| UMAP_KMeans_Labels | 2.637681 | 2.000000 | 3.000000 | 1.000000 | 1.000000 | 6.000000 |
| LLE_KMeans_Labels | 3.536232 | 2.428571 | 5.285714 | 2.000000 | 4.200000 | 5.111111 |
| TSNE_KMeans_Labels | 2.275362 | 7.000000 | 3.000000 | 2.000000 | 2.000000 | 6.000000 |
# Box-plot each original profile feature by T-SNE hierarchical cluster.
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(16,28))
plt.subplots_adjust(right=2)
plt.subplots_adjust(top=2)
for i, feature in enumerate(og_stock_profile_df.columns.to_list(), 1):
    plt.subplot(len(stock_profile_df.columns), 2, i)
    sns.boxplot(x='TSNE_HCLabels', y=feature, data=stock_profile_df, palette='husl')
    plt.title(f'{feature}', size=15, fontsize=12)
plt.tight_layout()
plt.show()
# Members of hierarchical cluster 1 form the candidate max-Sharpe portfolio.
TSNE_HC_max_sharpe_cluster_pf = stock_profile_df[stock_profile_df.TSNE_HCLabels == 1].index.to_list()
print(TSNE_HC_max_sharpe_cluster_pf)
# Shrunk covariance of the selected names' returns.
cov = returns[TSNE_HC_max_sharpe_cluster_pf].cov()
lw_cov = LedoitWolf().fit(cov)
rbust_cov = lw_cov.covariance_
['AMAT', 'AMC', 'AMD', 'ATVI', 'BX', 'EA', 'FTNT', 'GME', 'NFLX', 'NVDA', 'TSLA', 'TSM', 'TTWO', 'UPRO']
# Cluster map of the shrunk covariance for the T-SNE-HC-selected portfolio.
sns.clustermap(pd.DataFrame(rbust_cov, index=cov.index, columns=cov.columns), figsize=(12,8), row_cluster=False)
plt.show()
# Full-rank PCA of the robust covariance.
n = rbust_cov.shape[1]
pca = PCA(n_components=n, random_state=1)
data_pca = pd.DataFrame(pca.fit_transform(rbust_cov))
exp_var = pca.explained_variance_ratio_
# visualize the Explained Individual Components
plt.figure(figsize = (13,6))
plt.plot(range(1,n+1), pca.explained_variance_ratio_.cumsum(), marker = 'o', linestyle = '--', color='blue')
plt.plot(range(1,n+1), pca.explained_variance_ratio_, marker = 'o', color='red')
plt.title("Cumulative & Explained Variances by Components", fontsize=14)
plt.xlabel("Number of Components", fontsize=14)
plt.ylabel("Variance", fontsize=14)
plt.legend(['Cumulative', 'Explained']);
topPCs = pd.DataFrame(pca.components_[:9], columns=cov.columns)
# Min-max scale loadings, then row-normalise to 1 (long-only weights).
eigen_portfolios = topPCs.subtract(topPCs.min()).div(topPCs.max().subtract(topPCs.min()))
eigen_portfolios = eigen_portfolios.div(eigen_portfolios.sum(1), axis=0)
# eigen_portfolios = topPCs.div(topPCs.sum(1), axis=0)
eigen_portfolios.index = [f'Portfolio {i}' for i in range(1, topPCs.shape[0]+1)]
eigen_portfolios.T.plot.bar(subplots=True, figsize=(15,5), legend=False, sharex=True, layout=(3,3))
plt.tight_layout();
fig, axs = plt.subplots(figsize=(16,30))
n_pfs = topPCs.shape[0]+1
# NOTE(review): 9 colours for up to n_pfs panels; zip may drop the last portfolio — confirm.
for i, c in zip(range(n_pfs), ['k', 'b', 'orangered', 'g', 'r', 'purple', 'brown', 'hotpink', 'grey']):
    plt.subplot(n_pfs, 3, i+1)
    if i == 0:
        tmp = returns.SPY.cumsum()
        sr = (returns.SPY.mean() / returns.SPY.std()) * np.sqrt(252)
        tmp.plot(title=f'The Market \nSharpe Ratio: {round(sr,3)}', color=c)
    else:
        tmp = returns[TSNE_HC_max_sharpe_cluster_pf].mul(eigen_portfolios.iloc[i-1]).sum(1)
        sr = (tmp.mean() / tmp.std()) * np.sqrt(252)
        tmp.cumsum().plot(color=c, title=f'Portfolio {i} \nSharpe Ratio: {round(sr,3)}')
plt.tight_layout()
plt.show()
# List of all linkage methods to check
methods = ['single', 'complete', 'average', 'ward']
# Cophenetic correlation vs the UMAP-space pairwise distances.
pairwise_distance = pdist(X_umap)
fig, axs = plt.subplots(len(methods), 1, figsize=(16, 18))
for i, method in enumerate(methods):
    Z = linkage(X_umap, metric='euclidean', method=method)
    c, coph_dists = cophenet(Z, pairwise_distance)
    dendrogram(Z, leaf_rotation=90, leaf_font_size=12, labels=stock_profile_df.index, ax=axs[i]);
    axs[i].set_title(f'Method: {method.capitalize()} | Pairwise Distance Correlation: {round(c,4)}')
    axs[i].set_ylabel('Distance')
plt.tight_layout();
# Zoom in on the single-linkage dendrogram with a candidate cut height marked in red.
plt.figure(figsize=(16, 4))
plt.title("Dendrograms")
dend = dendrogram(linkage(X_umap, method='single'), leaf_font_size=12, leaf_rotation=90, labels=stock_profile_df.index)
plt.axhline(y=1, color='red', linestyle='--');
# Agglomerative clustering on the UMAP embedding (single linkage, matching the
# dendrogram above).
hierarchical = AgglomerativeClustering(n_clusters=4, affinity='euclidean', linkage='single')
hierarchical.fit(X_umap)
stock_profile_df['UMAP_HCLabels'] = hierarchical.labels_
labels = hierarchical.labels_
# Bar chart of cluster sizes.
colors = cm.rainbow(np.linspace(0, 1, stock_profile_df['UMAP_HCLabels'].nunique()))
stock_profile_df.UMAP_HCLabels.value_counts().plot(kind='barh', color=colors)
plt.title('Count of Unique Cluster Members')
plt.xlabel('Member Count')
plt.ylabel('Cluster ID');
# 2-D scatter of the UMAP embedding, coloured by hierarchical cluster.
plt.figure(1, facecolor='white', figsize=(10, 6))
plt.clf()
plt.axis('off')
scatter0 = plt.scatter(X_umap[:,0], X_umap[:,1], s=200, alpha=0.8,
                       c=labels[labels!=-1], cmap='rainbow', edgecolors='k', lw=0.75)
plt.legend(*scatter0.legend_elements(), title="Cluster", bbox_to_anchor=(1.125, 0.99))
plt.title('UMAP of all stocks with Hierarchical clusters noted');
# Per-cluster mean of each profile feature.
df_hc = stock_profile_df.groupby('UMAP_HCLabels').mean()
df_hc.T
| UMAP_HCLabels | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| Returns | 0.030514 | 0.093825 | 0.038789 | 0.007510 |
| Volatility | 0.836144 | 2.321256 | 0.659630 | 1.520488 |
| Sharpe_ratio | 0.593810 | 0.774091 | 0.962368 | 0.078366 |
| Volume | 3767.096346 | 3956.403331 | 3602.547543 | 4100.036570 |
| Intraday_Ch_Derv | -0.318778 | 1.453674 | 6.670867 | -3.787034 |
| KMeans_Labels | 2.283784 | 3.285714 | 6.000000 | 7.666667 |
| PCA_KMeans_Labels | 1.094595 | 2.000000 | 1.111111 | 0.000000 |
| UMAP_KMeans_Labels | 2.743243 | 2.000000 | 4.000000 | 3.000000 |
| LLE_KMeans_Labels | 3.743243 | 2.428571 | 3.222222 | 5.400000 |
| TSNE_KMeans_Labels | 2.391892 | 7.000000 | 5.000000 | 2.866667 |
| TSNE_HCLabels | 0.918919 | 1.000000 | 0.000000 | 1.866667 |
# Box-plot each original profile feature by UMAP hierarchical cluster.
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(16,28))
plt.subplots_adjust(right=2)
plt.subplots_adjust(top=2)
for i, feature in enumerate(og_stock_profile_df.columns.to_list(), 1):
    plt.subplot(len(stock_profile_df.columns), 2, i)
    sns.boxplot(x='UMAP_HCLabels', y=feature, data=stock_profile_df, palette='husl')
    plt.title(f'{feature}', size=15, fontsize=12)
plt.tight_layout()
plt.show()
# Members of UMAP-HC cluster 2 form the candidate max-Sharpe portfolio.
UMAP_HC_max_sharpe_cluster_pf = stock_profile_df[stock_profile_df.UMAP_HCLabels == 2].index.to_list()
print(UMAP_HC_max_sharpe_cluster_pf)
# Captured notebook output of the print above (no-op expression at runtime).
['ADBE', 'ASML', 'GOOGL', 'NOC', 'SHY', 'TMO', 'UNH', 'VOO', 'WCN']
# Shrunk covariance of the selected names' returns.
cov = returns[UMAP_HC_max_sharpe_cluster_pf].cov()
lw_cov = LedoitWolf().fit(cov)
rbust_cov = lw_cov.covariance_
# Cluster map of the shrunk covariance for the UMAP-HC-selected portfolio.
sns.clustermap(pd.DataFrame(rbust_cov, index=cov.index, columns=cov.columns), figsize=(12,8), row_cluster=False)
plt.show()
# Full-rank PCA of the robust covariance.
n = rbust_cov.shape[1]
pca = PCA(n_components=n, random_state=1)
data_pca = pd.DataFrame(pca.fit_transform(rbust_cov))
exp_var = pca.explained_variance_ratio_
# visualize the Explained Individual Components
plt.figure(figsize = (13,6))
plt.plot(range(1,n+1), pca.explained_variance_ratio_.cumsum(), marker = 'o', linestyle = '--', color='blue')
plt.plot(range(1,n+1), pca.explained_variance_ratio_, marker = 'o', color='red')
plt.title("Cumulative & Explained Variances by Components", fontsize=14)
plt.xlabel("Number of Components", fontsize=14)
plt.ylabel("Variance", fontsize=14)
plt.legend(['Cumulative', 'Explained']);
topPCs = pd.DataFrame(pca.components_[:9], columns=cov.columns)
# Min-max scale loadings, then row-normalise to 1 (long-only weights).
eigen_portfolios = topPCs.subtract(topPCs.min()).div(topPCs.max().subtract(topPCs.min()))
eigen_portfolios = eigen_portfolios.div(eigen_portfolios.sum(1), axis=0)
# eigen_portfolios = topPCs.div(topPCs.sum(1), axis=0)
eigen_portfolios.index = [f'Portfolio {i}' for i in range(1, topPCs.shape[0]+1)]
eigen_portfolios.T.plot.bar(subplots=True, figsize=(12,6), legend=False, sharex=True, layout=(3,3))
plt.tight_layout();
fig, axs = plt.subplots(figsize=(16,30))
n_pfs = topPCs.shape[0]+1
# NOTE(review): 9 colours for up to n_pfs panels; zip may drop the last portfolio — confirm.
for i, c in zip(range(n_pfs), ['k', 'b', 'orangered', 'g', 'r', 'purple', 'brown', 'hotpink', 'grey']):
    plt.subplot(n_pfs, 3, i+1)
    if i == 0:
        tmp = returns.SPY.cumsum()
        sr = (returns.SPY.mean() / returns.SPY.std()) * np.sqrt(252)
        tmp.plot(title=f'The Market \nSharpe Ratio: {round(sr,3)}', color=c)
    else:
        tmp = returns[UMAP_HC_max_sharpe_cluster_pf].mul(eigen_portfolios.iloc[i-1]).sum(1)
        sr = (tmp.mean() / tmp.std()) * np.sqrt(252)
        tmp.cumsum().plot(color=c, title=f'Portfolio {i} \nSharpe Ratio: {round(sr,3)}')
plt.tight_layout()
plt.show()
def plot_dbscan_eps(X: np.array, esp_: list, min_samples: list, fig_s: list = None, tsne: bool = False) -> None:
    """Visualise DBSCAN cluster assignments over a grid of eps / min_samples values.

    For each ``min_samples`` value, one figure is drawn with a subplot per eps
    in ``esp_``; clustered points are coloured, noise points (label -1) are
    drawn faintly without colour.

    Parameters
    ----------
    X : np.array
        Feature matrix to cluster (DBSCAN always runs on this raw data).
    esp_ : list
        eps values to sweep.
    min_samples : list
        min_samples values to sweep (one figure each).
    fig_s : list, optional
        Figure [width, height]; defaults to [17, 80].
    tsne : bool
        If True, embed X to 2-D with T-SNE for plotting only.
    """
    # Avoid a mutable default argument; the effective default is unchanged.
    if fig_s is None:
        fig_s = [17, 80]
    # Embedding is for display only — clustering still runs on the raw X.
    if tsne:
        X_plt = TSNE(n_components=2, perplexity=50, n_iter=1000, n_jobs=-1).fit_transform(X)
    else:
        X_plt = X.copy()
    for s in min_samples:
        fig = plt.figure(figsize=(fig_s[0], fig_s[1]))
        # enumerate supplies the subplot index; the original also kept a manual
        # counter (`i = 1` / `i += 1`) that enumerate immediately overwrote.
        for i, x in enumerate(esp_):
            db = DBSCAN(eps=x, min_samples=s, n_jobs=-1)
            labels = db.fit_predict(X)
            ax = fig.add_subplot(len(esp_), 3, i+1)
            # Clustered points coloured by label; noise (-1) drawn faintly below.
            scatter0 = ax.scatter(X_plt[(labels!=-1), 0], X_plt[(labels!=-1), 1], s=100, alpha=0.7, c=labels[labels!=-1],
                                  cmap='rainbow', edgecolor='k', lw=0.5)
            scatter1 = ax.scatter(X_plt[(labels==-1), 0], X_plt[(labels==-1), 1], s=100, alpha=0.175)
            try:
                legend1 = ax.legend(*scatter0.legend_elements(), loc='best', title="Cluster")
                ax.add_artist(legend1);
            except Exception:
                # legend_elements can fail (e.g. every point classified as noise);
                # the plot is still useful without a legend.
                pass
            ax.set_title(f'eps = {round(x,4)}', fontsize=14)
        fig.suptitle(f'Min Samples: {s}', fontsize=16)
        plt.tight_layout()
        fig.subplots_adjust(top=0.97)
        plt.show()
# --- DBSCAN on the PCA embedding: sweep eps/min_samples, then fit the chosen config ---
esp_ = np.linspace(0.2, 0.4, 21)
min_samples = [2, 3, 5]
plot_dbscan_eps(
    X=X_pca,
    esp_=esp_,
    min_samples=min_samples,
    fig_s=[16,65],
    tsne=True
)
dbscan = DBSCAN(eps=0.3, min_samples=5, n_jobs=-1)
stock_profile_df['PCA_DBSCAN_Labels'] = dbscan.fit_predict(X_pca)
# Reuse labels_ from the fit above (the original refit the model a second time).
labels = dbscan.labels_
colors = cm.rainbow(np.linspace(0, 1, stock_profile_df['PCA_DBSCAN_Labels'].nunique()))
# Fixed: the original plotted UMAP_HCLabels counts here — a copy/paste slip;
# this section summarises the DBSCAN labels.
stock_profile_df.PCA_DBSCAN_Labels.value_counts().plot(kind='barh', color=colors)
plt.title('Count of Unique Cluster Members')
plt.xlabel('Member Count')
plt.ylabel('Cluster ID');
# T-SNE embedding for 2-D display only; clustering was done in PCA space.
X_plt = TSNE(n_components=2, perplexity=50, n_iter=1000, n_jobs=-1).fit_transform(X_pca)
# NOTE(review): `X` here is presumably the original feature matrix indexed by
# ticker — confirm against the earlier cells.
clustered_series_all = pd.Series(index=X.index, data=dbscan.labels_.flatten())
plt.figure(1, facecolor='white', figsize=(10, 6))
plt.clf()
plt.axis('off')
# Clustered points coloured by label; noise (-1) drawn faintly underneath.
scatter0 = plt.scatter(X_plt[(labels!=-1), 0], X_plt[(labels!=-1), 1], s=200, alpha=0.75,
                       c=labels[labels!=-1], cmap='rainbow', edgecolors='k', lw=0.75)
scatter1 = plt.scatter(X_plt[(clustered_series_all==-1).values, 0], X_plt[(clustered_series_all==-1).values, 1],
                       s=200, alpha=0.175)
plt.legend(*scatter0.legend_elements(), loc='best', title="Cluster")
plt.title('PCA of all stocks with DBSCAN clusters noted');
# Per-cluster mean of each profile feature.
df_hc = stock_profile_df.groupby('PCA_DBSCAN_Labels').mean()
df_hc.T
| PCA_DBSCAN_Labels | -1 | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|---|
| Returns | 0.036777 | 0.042860 | 0.037372 | 0.026542 | 0.064361 | 0.017382 |
| Volatility | 1.298578 | 0.897524 | 0.745029 | 0.884660 | 1.330341 | 0.594346 |
| Sharpe_ratio | 0.521018 | 0.760543 | 0.798086 | 0.476719 | 0.769321 | 0.470150 |
| Volume | 3880.214482 | 3915.217521 | 3643.143904 | 3761.065404 | 4042.292820 | 3501.125826 |
| Intraday_Ch_Derv | 0.966791 | 1.681730 | -2.020971 | -1.890481 | 1.399124 | -3.017255 |
| KMeans_Labels | 3.901639 | 3.625000 | 5.133333 | 1.000000 | 2.600000 | 1.800000 |
| PCA_KMeans_Labels | 1.032787 | 1.000000 | 1.000000 | 1.000000 | 2.000000 | 1.000000 |
| UMAP_KMeans_Labels | 2.393443 | 2.250000 | 5.466667 | 2.500000 | 1.600000 | 2.600000 |
| LLE_KMeans_Labels | 3.459016 | 6.000000 | 6.000000 | 2.444444 | 3.000000 | 2.600000 |
| TSNE_KMeans_Labels | 2.868852 | 2.250000 | 5.133333 | 2.888889 | 5.600000 | 2.600000 |
| TSNE_HCLabels | 0.819672 | 1.875000 | 1.333333 | 0.777778 | 0.800000 | 1.400000 |
| UMAP_HCLabels | 1.065574 | 0.000000 | 0.533333 | 0.000000 | 0.800000 | 0.000000 |
# Box-plot each original profile feature by DBSCAN cluster.
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(16,28))
plt.subplots_adjust(right=2)
plt.subplots_adjust(top=2)
for i, feature in enumerate(og_stock_profile_df.columns.to_list(), 1):
    plt.subplot(len(stock_profile_df.columns), 2, i)
    sns.boxplot(x='PCA_DBSCAN_Labels', y=feature, data=stock_profile_df, palette='husl')
    plt.title(f'{feature}', size=15, fontsize=12)
plt.tight_layout()
plt.show()
# Fixed: the original filtered on UMAP_HCLabels here — a copy/paste slip; this
# section selects the max-Sharpe DBSCAN cluster (cluster 1 per the summary table).
PCA_DBSCAN_max_sharpe_cluster_pf = stock_profile_df[stock_profile_df.PCA_DBSCAN_Labels == 1].index.to_list()
print(PCA_DBSCAN_max_sharpe_cluster_pf)
# Captured notebook output (stale: reflects the pre-fix UMAP_HCLabels filter).
['AMAT', 'AMC', 'AMD', 'ATVI', 'BX', 'EA', 'FTNT', 'GME', 'NFLX', 'NVDA', 'TSLA', 'TSM', 'TTWO', 'UPRO']
# Shrunk covariance of the selected names' returns.
cov = returns[PCA_DBSCAN_max_sharpe_cluster_pf].cov()
lw_cov = LedoitWolf().fit(cov)
rbust_cov = lw_cov.covariance_
# Cluster map of the shrunk covariance for the DBSCAN-selected portfolio.
sns.clustermap(pd.DataFrame(rbust_cov, index=cov.index, columns=cov.columns), figsize=(12,8), row_cluster=False)
plt.show()
# Full-rank PCA of the robust covariance.
n = rbust_cov.shape[1]
pca = PCA(n_components=n, random_state=1)
data_pca = pd.DataFrame(pca.fit_transform(rbust_cov))
exp_var = pca.explained_variance_ratio_
# visualize the Explained Individual Components
plt.figure(figsize = (13,6))
plt.plot(range(1,n+1), pca.explained_variance_ratio_.cumsum(), marker = 'o', linestyle = '--', color='blue')
plt.plot(range(1,n+1), pca.explained_variance_ratio_, marker = 'o', color='red')
plt.title("Cumulative & Explained Variances by Components", fontsize=14)
plt.xlabel("Number of Components", fontsize=14)
plt.ylabel("Variance", fontsize=14)
plt.legend(['Cumulative', 'Explained']);
topPCs = pd.DataFrame(pca.components_[:9], columns=cov.columns)
# Min-max scale loadings, then row-normalise to 1 (long-only weights).
eigen_portfolios = topPCs.subtract(topPCs.min()).div(topPCs.max().subtract(topPCs.min()))
eigen_portfolios = eigen_portfolios.div(eigen_portfolios.sum(1), axis=0)
# eigen_portfolios = topPCs.div(topPCs.sum(1), axis=0)
eigen_portfolios.index = [f'Portfolio {i}' for i in range(1, topPCs.shape[0]+1)]
eigen_portfolios.T.plot.bar(subplots=True, figsize=(12,6), legend=False, sharex=True, layout=(3,3))
plt.tight_layout();
fig, axs = plt.subplots(figsize=(16,30))
n_pfs = topPCs.shape[0]+1
# NOTE(review): 9 colours for up to n_pfs panels; zip may drop the last portfolio — confirm.
for i, c in zip(range(n_pfs), ['k', 'b', 'orangered', 'g', 'r', 'purple', 'brown', 'hotpink', 'grey']):
    plt.subplot(n_pfs, 3, i+1)
    if i == 0:
        tmp = returns.SPY.cumsum()
        sr = (returns.SPY.mean() / returns.SPY.std()) * np.sqrt(252)
        tmp.plot(title=f'The Market \nSharpe Ratio: {round(sr,3)}', color=c)
    else:
        tmp = returns[PCA_DBSCAN_max_sharpe_cluster_pf].mul(eigen_portfolios.iloc[i-1]).sum(1)
        sr = (tmp.mean() / tmp.std()) * np.sqrt(252)
        tmp.cumsum().plot(color=c, title=f'Portfolio {i} \nSharpe Ratio: {round(sr,3)}')
plt.tight_layout()
plt.show()
class BlockingTimeSeriesSplit():
    """Walk-forward cross-validator for (optionally multi-indexed) time series.

    ``split`` yields (train, test) integer index arrays. With ``roll=True`` the
    train window has fixed length ``n_train`` and rolls forward by ``hold``
    rows; with ``roll=False`` it expands from the start of the data. ``margin``
    rows are skipped between train and test to limit leakage. When
    ``multi_idx=True`` all row-based sizes are scaled by ``n_assets`` because
    each timestamp contributes ``n_assets`` rows.
    """

    def __init__(self, n_assets=2, n_splits=5, margin=5, n_train=1000, test_size=1, roll=True,
                 hold=5, multi_idx=False):
        self.n_splits = n_splits
        self.roll = roll
        # One scaling factor replaces the original duplicated if/else branches.
        scale = n_assets if multi_idx else 1
        self.margin = margin * scale
        self.hold = hold * scale
        self.test_size = test_size * scale
        self.n_train = n_train * scale

    def _iter_splits(self, X):
        # Single source of truth for the window arithmetic; previously this
        # logic was duplicated in split() and get_n_splits().
        n_samples = len(X)
        indices = np.arange(n_samples)
        for i in range(self.n_train, n_samples, self.hold):
            if self.roll:
                train = indices[i - self.n_train : i]
            else:
                train = indices[0 : i]
            test = indices[i + self.margin : i + self.test_size + self.margin]
            if len(test) != self.test_size:
                # Not enough data left for a full test window: stop.
                break
            yield (train, test)

    def split(self, X, y=None, groups=None):
        """Yield (train_indices, test_indices) pairs over X."""
        yield from self._iter_splits(X)

    def get_n_splits(self, X, y, groups):
        """Return the number of splits that split() would yield for X."""
        return sum(1 for _ in self._iter_splits(X))
# Walk-forward CV: 365-row (per asset) rolling train window, 63-row test window,
# no gap, stepping 63 rows per fold; sizes are scaled per asset via multi_idx.
cv = BlockingTimeSeriesSplit(n_assets=len(closes.columns), n_train=365, test_size=63, margin=0,
                             hold=63, roll=True, multi_idx=True)
# Mid-file third-party import kept in place to preserve the notebook cell order.
from sklearn_extra.cluster import KMedoids
# Candidate cluster counts for the per-fold model selection below.
n_clusters = list(range(4, 8))
Results, silhouette_avgs = [], []
# Walk-forward backtest: on each fold, cluster assets by train-window
# features, pick the highest-Sharpe cluster, weight it with an
# eigen-portfolio from a shrunk covariance, and collect out-of-sample
# returns.
for train_idx, test_idx in cv.split(ohlcv_df, ohlcv_df):
    Train = ohlcv_df.iloc[train_idx]
    Test = ohlcv_df.iloc[test_idx]
    # Rows (if any) falling between the last train row and the first test
    # row -- the margin gap; first 3 columns kept for display only.
    mrg = ohlcv_df.iloc[train_idx[-1] + 1:test_idx[0]].iloc[:, :3]
    print()
    print(f'\nTraining on {int(len(Train))} records from {Train.index[0][0]} - {Train.index[0][-1]}')
    print(f'\nPredicting on {int(len(Test))} record from {Test.index[0][0]} - {Test.index[0][-1]} with {int(len(mrg))} days margin')
    if len(mrg) > 0:
        print(f'\nMargin Dates: {mrg.index[-1][0]}')
    print()
    # --- Per-asset feature matrix from the training window only ---
    # Returns / Volatility / Sharpe from close-to-close pct changes,
    # plus mean volume and mean intraday-change derivative.
    _X = (Train.close.unstack().pct_change().mean()).to_frame('Returns')
    _X['Volatility'] = Train.close.unstack().pct_change().std()
    _X['Sharpe_ratio'] = (_X['Returns'] / _X['Volatility']) * np.sqrt(252)
    _X['Volume'] = Train.volume.unstack().mean()
    _X['Intraday_Ch_Derv'] = Train.CLS_INTD_CH_DERV.unstack().mean()
    # Compress the volatility scale before standardisation.
    _X.Volatility = _X.Volatility.apply(np.log1p)
    # Drop assets carrying NaN/inf in any feature (keyword axis: the
    # positional `any(1)` form is deprecated in recent pandas).
    _X = _X[~_X.isin([np.nan, np.inf, -np.inf]).any(axis=1)]
    _X_copy = _X.copy()
    _X = pd.DataFrame(
        StandardScaler().fit_transform(_X),
        index=_X.index,
        columns=_X.columns)
    # Grid-search the cluster count K on a 2-component PCA embedding.
    lle_res_df = find_pca_k(
        X=_X,
        n_comp=2,
        n_clusters=n_clusters,
        rs=1,
        n_jobs=-1,
        V=0
    )
    K = lle_res_df.head(1)['K'].values[0].astype(int)
    # BUG FIX: the original printed `N_Neighbors`, whose assignment was
    # commented out together with the abandoned LLE path -- the name only
    # resolved via stale notebook state.  Report the K actually used.
    print(f'K: {K}')
    # 2-D embedding for clustering and plotting.
    X_lle = PCA(n_components=2).fit_transform(_X.values)
    kmeans = KMeans(n_clusters=K, random_state=1)
    kmeans.fit(X_lle)
    # Predict once and reuse (the original called predict twice).
    labels = kmeans.predict(X_lle)
    _X_copy['LLE_KMeans_Labels'] = labels
    silhouette_avg = silhouette_score(X_lle, labels)
    silhouette_avgs.append(silhouette_avg)
    # Bar chart of cluster membership counts.
    colors = cm.rainbow(np.linspace(0, 1, _X_copy['LLE_KMeans_Labels'].nunique()))
    _X_copy.LLE_KMeans_Labels.value_counts().plot(kind='barh', color=colors)
    plt.title('Count of Unique Cluster Members')
    plt.xlabel('Member Count')
    plt.ylabel('Cluster ID');
    plt.show()
    # Scatter of the embedding coloured by cluster.  KMeans never emits
    # the label -1, so labels[labels != -1] equals labels; the filter is
    # kept for parity with DBSCAN-style noise handling.
    plt.figure(1, facecolor='white', figsize=(10, 6))
    plt.clf()
    plt.axis('off')
    scatter0 = plt.scatter(X_lle[:, 0], X_lle[:, 1], s=200, alpha=0.8,
                           c=labels[labels != -1], cmap='rainbow', edgecolors='k', lw=0.75)
    plt.legend(*scatter0.legend_elements(), title="Cluster", bbox_to_anchor=(1.125, 0.99))
    plt.title(f'LLE of all stocks with Kmeans clusters noted \nSilhouette Avg: {silhouette_avg}')
    plt.show()
    # Choose the cluster with the highest mean Sharpe ratio as the portfolio.
    df_kmeans = _X_copy.groupby('LLE_KMeans_Labels').mean()[['Sharpe_ratio']].T
    max_sharpe_cluster = df_kmeans.values.argmax()
    print(f'Cluster Profile: \n{df_kmeans}')
    print(f'Max Sharpe Cluster: {max_sharpe_cluster}')
    LLE_Kmeans_max_sharpe_cluster_pf = _X_copy[_X_copy.LLE_KMeans_Labels == max_sharpe_cluster].index.to_list()
    print(f'Max Sharpe Portfolio: \n{LLE_Kmeans_max_sharpe_cluster_pf}')
    # Shrunk covariance of the chosen names' train-window returns.
    X_returns = Train.close.unstack().pct_change().dropna()
    cov = X_returns[LLE_Kmeans_max_sharpe_cluster_pf].cov()
    # NOTE(review): LedoitWolf.fit expects raw observations, not a
    # covariance matrix -- fitting on `cov` shrinks the covariance OF the
    # covariance.  Preserved as-is pending confirmation of intent.
    lw_cov = LedoitWolf().fit(cov)
    rbust_cov = lw_cov.covariance_
    n = rbust_cov.shape[1]
    pca = PCA(n_components=n, random_state=1)
    data_pca = pd.DataFrame(pca.fit_transform(rbust_cov))
    # Use the 8th eigen-portfolio when available, otherwise the last one
    # the cluster size permits (idiomatic min() replaces the if/else).
    n_pf = min(8, len(LLE_Kmeans_max_sharpe_cluster_pf))
    topPCs = pd.DataFrame(pca.components_[:n], columns=cov.columns)
    # Min-max scale each component, then normalise rows to sum to 1 so
    # each row is a set of long-only portfolio weights.
    eigen_portfolios = topPCs.subtract(topPCs.min()).div(topPCs.max().subtract(topPCs.min()))
    eigen_portfolios = eigen_portfolios.div(eigen_portfolios.sum(axis=1), axis=0)
    eigen_portfolios.index = [f'Portfolio {i}' for i in range(1, topPCs.shape[0] + 1)]
    pf_wts = eigen_portfolios.loc[f'Portfolio {n_pf}']
    # Single-asset cluster: min-max scaling divides by zero (max == min),
    # leaving NaN -- the lone asset then gets full weight.
    if len(pf_wts.values) < 2:
        pf_wts = pf_wts.replace(np.nan, 1.0)
        eigen_portfolios = eigen_portfolios.replace(np.nan, 1.0)
    eigen_portfolios.loc[f'Portfolio {n_pf}'].plot(kind='bar', figsize=(10,4), title='Max Sharpe Portfolio Weights')
    plt.show()
    # Out-of-sample: weighted cumulative returns of the picked portfolio.
    X_test_returns = Test.close.unstack().pct_change().dropna()
    rets0 = X_test_returns[LLE_Kmeans_max_sharpe_cluster_pf].mul(pf_wts.values, axis=1)
    Results.append(rets0.cumsum().sum(axis=1))
    # Re-concatenate and re-plot the running strategy curve every fold,
    # labelled with a naive annualised Sharpe ratio.
    Strat_Rets = pd.concat(Results, axis=0)
    SR = (Strat_Rets.mean() / Strat_Rets.std() if Strat_Rets.std() > 0 else 1) * np.sqrt(252)
    Strat_Rets.cumsum().plot(c='g', label=f'Sharpe_Ratio: {round(SR,3)}', figsize=(12,4))
    plt.show()
    print()
    print('-' * 100)
Training on 40880 records from 2014-01-02 - AAPL Predicting on 7056 record from 2015-06-16 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 4 5 \ Sharpe_ratio 1.019082 -0.129296 1.329517 -0.759191 -0.112196 0.573332 LLE_KMeans_Labels 6 Sharpe_ratio 0.852285 Max Sharpe Cluster: 2 Max Sharpe Portfolio: ['ABT', 'AMC', 'ANTM', 'BX', 'DIS', 'EFX', 'FDX', 'FTNT', 'HD', 'HRL', 'IBB', 'IYR', 'KMX', 'MCO', 'NKE', 'NOC', 'ORLY', 'SBUX', 'TGT', 'UNH', 'URE', 'VNQ', 'WFC']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2014-04-03 - AAPL Predicting on 7056 record from 2015-09-15 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 4 Sharpe_ratio 0.41957 -0.87385 1.124145 -0.710622 0.007185 Max Sharpe Cluster: 2 Max Sharpe Portfolio: ['AAPL', 'ABT', 'ADBE', 'AMZN', 'ANTM', 'ATVI', 'DG', 'DIS', 'EA', 'EFX', 'FB', 'FTNT', 'HD', 'HRL', 'IBB', 'MA', 'NFLX', 'NKE', 'NOC', 'ORLY', 'QQQ', 'SBUX', 'TGT', 'TMF', 'TTWO', 'UNH']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2014-07-03 - AAPL Predicting on 7056 record from 2015-12-14 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 4 5 \ Sharpe_ratio -1.025024 0.381863 1.31749 0.209559 -1.005875 0.678128 LLE_KMeans_Labels 6 Sharpe_ratio -0.598822 Max Sharpe Cluster: 2 Max Sharpe Portfolio: ['AMZN', 'ATVI', 'DIS', 'EA', 'EFX', 'HD', 'HRL', 'NKE', 'NOC', 'ORLY', 'SBUX', 'TGT', 'TTWO', 'UNH', 'V', 'VLO']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2014-10-02 - AAPL Predicting on 7056 record from 2016-03-16 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 4 Sharpe_ratio 0.390218 0.383446 1.211617 -0.150132 -0.514767 Max Sharpe Cluster: 2 Max Sharpe Portfolio: ['AMC', 'AMZN', 'ATVI', 'AZO', 'DG', 'EA', 'EFX', 'HD', 'HRL', 'MCD', 'NKE', 'NOC', 'NVDA', 'ORLY', 'SBUX', 'SO', 'SYY', 'TGT', 'TTWO', 'UNH', 'V', 'VLO', 'WCN']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2015-01-02 - AAPL Predicting on 7056 record from 2016-06-15 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 Sharpe_ratio 0.3614 -0.197206 1.162922 0.991483 Max Sharpe Cluster: 2 Max Sharpe Portfolio: ['AMD', 'NVDA']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2015-04-06 - AAPL Predicting on 7056 record from 2016-09-14 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 4 5 \ Sharpe_ratio -0.103837 0.333356 1.997823 0.496135 0.83447 1.0277 LLE_KMeans_Labels 6 Sharpe_ratio -0.60034 Max Sharpe Cluster: 2 Max Sharpe Portfolio: ['NVDA']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2015-07-06 - AAPL Predicting on 7056 record from 2016-12-13 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 4 5 \ Sharpe_ratio -0.378615 1.087211 2.541109 0.289131 0.623766 0.437691 LLE_KMeans_Labels 6 Sharpe_ratio 1.34452 Max Sharpe Cluster: 2 Max Sharpe Portfolio: ['NVDA']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2015-10-02 - AAPL Predicting on 7056 record from 2017-03-16 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 4 5 \ Sharpe_ratio 0.523476 1.073777 0.515289 1.944778 1.290272 2.429962 LLE_KMeans_Labels 6 Sharpe_ratio -0.488779 Max Sharpe Cluster: 5 Max Sharpe Portfolio: ['AMAT', 'NVDA']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2016-01-04 - AAPL Predicting on 7056 record from 2017-06-15 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 Sharpe_ratio 1.020609 1.258891 1.909417 0.111071 Max Sharpe Cluster: 2 Max Sharpe Portfolio: ['AMD', 'NVDA']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2016-04-05 - AAPL Predicting on 7056 record from 2017-09-14 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 Sharpe_ratio 1.376658 1.005067 -0.259813 2.056895 Max Sharpe Cluster: 3 Max Sharpe Portfolio: ['AMD', 'NVDA']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2016-07-05 - AAPL Predicting on 7056 record from 2017-12-13 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 Sharpe_ratio 1.857997 0.531385 -0.292579 2.10746 Max Sharpe Cluster: 3 Max Sharpe Portfolio: ['AAPL', 'AMD', 'BAC', 'NVDA']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2016-10-03 - AAPL Predicting on 7056 record from 2018-03-16 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 Sharpe_ratio 0.305863 -0.284141 1.668267 1.723099 Max Sharpe Cluster: 3 Max Sharpe Portfolio: ['AAPL', 'AMD', 'BAC', 'NVDA']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2017-01-03 - AAPL Predicting on 7056 record from 2018-06-15 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 4 5 \ Sharpe_ratio 0.108139 1.389015 1.44605 2.328814 0.270726 -0.892388 LLE_KMeans_Labels 6 Sharpe_ratio 0.52387 Max Sharpe Cluster: 3 Max Sharpe Portfolio: ['ABT', 'ADBE', 'AMZN', 'ASML', 'BA', 'CRM', 'EA', 'EL', 'FTNT', 'MA', 'MCO', 'NFLX', 'TTWO', 'V', 'VLO']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2017-04-04 - AAPL Predicting on 7056 record from 2018-09-14 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 Sharpe_ratio 1.233054 0.492626 1.557395 -0.095483 Max Sharpe Cluster: 2 Max Sharpe Portfolio: ['ABT', 'ADBE', 'ADSK', 'AMZN', 'ANTM', 'ASML', 'ATVI', 'BA', 'BX', 'CAT', 'CRM', 'DE', 'DG', 'EL', 'FTNT', 'GOOGL', 'HD', 'HON', 'HRL', 'KMX', 'LHX', 'MA', 'MCD', 'MCO', 'NFLX', 'NKE', 'PFE', 'QQQ', 'SPY', 'SYY', 'TMO', 'TSM', 'TTWO', 'TXN', 'UNH', 'UPRO', 'USO', 'V', 'VLO', 'VOO', 'WCN', 'WMT']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2017-07-05 - AAPL Predicting on 7056 record from 2018-12-14 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 Sharpe_ratio 0.485096 1.202693 -0.374799 0.275011 Max Sharpe Cluster: 1 Max Sharpe Portfolio: ['ABT', 'ADBE', 'AMZN', 'ANTM', 'AZO', 'BA', 'CRM', 'DG', 'EL', 'FTNT', 'HRL', 'LHX', 'MA', 'MCD', 'MSFT', 'NFLX', 'NKE', 'ORLY', 'PFE', 'SYY', 'TMO', 'TTWO', 'TXN', 'UNH', 'V', 'VZ', 'WCN', 'WMT']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2017-10-03 - AAPL Predicting on 7056 record from 2019-03-19 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 Sharpe_ratio 1.222364 -0.28973 0.357929 0.577822 Max Sharpe Cluster: 0 Max Sharpe Portfolio: ['ABT', 'ADBE', 'AMZN', 'ANTM', 'AZO', 'BA', 'CRM', 'DG', 'EL', 'FTNT', 'HRL', 'MA', 'NFLX', 'NKE', 'ORLY', 'SBUX', 'SHY', 'TMO', 'UNH', 'V', 'WMT']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2018-01-03 - AAPL Predicting on 7056 record from 2019-06-18 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 Sharpe_ratio 0.583431 -0.286483 0.842221 -1.021448 Max Sharpe Cluster: 2 Max Sharpe Portfolio: ['AAPL', 'ABT', 'ADBE', 'AMD', 'BX', 'CMCSA', 'D', 'DG', 'DIS', 'FTNT', 'HYG', 'JPM', 'MA', 'MSFT', 'NFLX', 'ORCL', 'PFE', 'PG', 'QQQ', 'SBUX', 'SHY', 'SPY', 'TGT', 'TMF', 'TMO', 'UNH', 'VNQ', 'VZ']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2018-04-05 - AAPL Predicting on 7056 record from 2019-09-17 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 4 5 \ Sharpe_ratio 0.207316 1.201184 1.111223 -0.987473 -0.294611 1.390498 LLE_KMeans_Labels 6 Sharpe_ratio 0.207961 Max Sharpe Cluster: 5 Max Sharpe Portfolio: ['AMD']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2018-07-05 - AAPL Predicting on 7056 record from 2019-12-16 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 Sharpe_ratio 1.007936 0.980548 -0.906097 -0.025315 Max Sharpe Cluster: 0 Max Sharpe Portfolio: ['ADSK', 'AGG', 'ANTM', 'ASML', 'AZO', 'BND', 'EFX', 'EL', 'EMLP', 'GLD', 'GOOGL', 'HD', 'HON', 'HRL', 'HSY', 'IYR', 'JNJ', 'JNK', 'KMX', 'KO', 'KXI', 'LHX', 'LQD', 'MCD', 'MCO', 'NKE', 'NOC', 'ORLY', 'PEP', 'SAP', 'SHY', 'SO', 'SYY', 'TM', 'UPS', 'URE', 'V', 'VDC', 'VHT', 'VNQI', 'VOO', 'VSS', 'WCN', 'WMT']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2018-10-03 - AAPL Predicting on 7056 record from 2020-03-18 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 4 5 Sharpe_ratio 1.134912 -0.785001 -1.321904 0.175203 -0.132226 -1.714059 Max Sharpe Cluster: 0 Max Sharpe Portfolio: ['AGG', 'BND', 'DG', 'GLD', 'HRL', 'HSY', 'LQD', 'SHY', 'SO', 'TMF', 'TMO', 'WMT']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2019-01-04 - AAPL Predicting on 7056 record from 2020-06-17 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 4 Sharpe_ratio 0.650499 0.236185 -0.688908 1.216703 -0.10201 Max Sharpe Cluster: 3 Max Sharpe Portfolio: ['AAPL', 'ABT', 'ADBE', 'AMAT', 'AMD', 'ASML', 'ATVI', 'BX', 'DG', 'FB', 'FTNT', 'GLD', 'MA', 'MSFT', 'NVDA', 'PG', 'QQQ', 'SHY', 'TGT', 'TMF', 'TMO', 'TSLA', 'TSM']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2019-04-05 - AAPL Predicting on 7056 record from 2020-09-16 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 4 Sharpe_ratio 0.54742 0.216992 -0.614403 1.228644 -0.411381 Max Sharpe Cluster: 3 Max Sharpe Portfolio: ['AAPL', 'ADBE', 'AMD', 'ATVI', 'BX', 'DG', 'FB', 'GDX', 'GDXJ', 'GLD', 'MSFT', 'NVDA', 'PG', 'QQQ', 'SHY', 'SLV', 'TGT', 'TMF', 'TMO', 'TSLA', 'TSM']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2019-07-08 - AAPL Predicting on 7056 record from 2020-12-15 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 4 5 \ Sharpe_ratio -0.346938 0.326773 0.928114 1.575786 0.486279 0.219697 LLE_KMeans_Labels 6 Sharpe_ratio -0.295894 Max Sharpe Cluster: 3 Max Sharpe Portfolio: ['AAPL', 'AMD', 'GME', 'NVDA', 'TSLA', 'TSM']
---------------------------------------------------------------------------------------------------- Training on 40880 records from 2019-10-04 - AAPL Predicting on 7056 record from 2021-03-18 - AAPL with 0 days margin N_Neighbors: 50
Cluster Profile: LLE_KMeans_Labels 0 1 2 3 Sharpe_ratio 0.368408 1.262467 1.02227 0.640281 Max Sharpe Cluster: 1 Max Sharpe Portfolio: ['AAPL', 'AMAT', 'AMD', 'F', 'M', 'MSFT', 'NVDA', 'TSLA', 'TSM']
----------------------------------------------------------------------------------------------------
# Final equity curve across all walk-forward folds, labelled with the
# annualised Sharpe ratio computed in the loop above.
Strat_Rets.cumsum().plot(c='g', label=f'Sharpe_Ratio: {round(SR,3)}', figsize=(10,4));
# Attach quantstats helpers to pandas objects (.sharpe, .sortino, ...).
qs.extend_pandas()
# quantstats expects a DatetimeIndex; the concatenated fold index may be
# object-typed, so coerce before reporting.
stock = Strat_Rets.cumsum().copy()
stock.index = pd.to_datetime(stock.index)
# NOTE(review): metrics() is being fed the *cumulative* return series;
# quantstats normally expects per-period returns.  The extreme Cumulative
# Return in the printed report suggests double compounding -- confirm
# whether `Strat_Rets` (not its cumsum) was intended here.
qs.reports.metrics(stock)
Strategy ------------------ ----------- Start Period 2015-06-17 End Period 2021-06-16 Risk-Free Rate 0.0% Time in Market 100.0% Cumulative Return 453,863.93% CAGR% 306.65% Sharpe 1.83 Sortino 43.97 Sortino/√2 31.09 Max Drawdown -40.84% Longest DD Days 609 Gain/Pain Ratio 11.81 Gain/Pain (1M) 14.43 Payoff Ratio 4.11 Profit Factor 12.81 Common Sense Ratio 117.73 CPC Index 39.82 Tail Ratio 9.19 Outlier Win Ratio 11.23 Outlier Loss Ratio 5.63 MTD 2.91% 3M -0.13% 6M 6.94% YTD 6.83% 1Y 13.06% 3Y (ann.) 15.02% 5Y (ann.) 74.24% 10Y (ann.) 306.65% All-time (ann.) 306.65% Avg. Drawdown -3.82% Avg. Drawdown Days 67 Recovery Factor 11113.59 Ulcer Index 1.0